diff options
-rw-r--r-- | notes/dblp_hacking.txt | 72 | ||||
-rw-r--r-- | python/Pipfile | 4 | ||||
-rw-r--r-- | python/Pipfile.lock | 94 | ||||
-rw-r--r-- | python/tests/transform_csl.py | 5 | ||||
-rw-r--r-- | python/tests/web_citation_csl.py | 2 |
5 files changed, 147 insertions, 30 deletions
diff --git a/notes/dblp_hacking.txt b/notes/dblp_hacking.txt new file mode 100644 index 00000000..6ebcdc45 --- /dev/null +++ b/notes/dblp_hacking.txt @@ -0,0 +1,72 @@ + +Notes from fall 2020 + +## prefix counts + + # of conferences: 5,329 + # of journals: 1,724 + + zcat dblp.xml.gz | rg "key=" | rg "mdate=" | cut -f3 -d' ' | cut -f2 -d'"' | pv -l > keys.txt + => 8.00M + + cat keys.txt | cut -f1 -d/ | sort | uniq -c | sort -nr + 2764029 conf + 2640949 homepages + 2431614 journals + 77682 phd + 37402 books + 27830 reference + 19153 series + 555 tr + tr/ibm/LILOG34 + tr/sql/X3H2-90-292 + 16 persons + 15 www + www/org/w3/TR/xquery + www/org/mitre/future + 6 ms + 3 dblpnote + + cat keys.txt | cut -f1-2 -d/ | sort -u | cut -f1 -d/ | sort | uniq -c | sort -nr + 5138 conf + 1725 journals + 291 homepages + 125 phd + 96 series + 77 books + 60 reference + 16 persons + 9 tr + 6 ms + 3 dblpnote + 2 www + +Fetch all the HTML: + + shuf prefixes.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +Got blocked; supposed to do only one per minute. Delete missing and try again with `-j1` not `-j4`: + + find . -empty -type f -delete + +Roughly 500x in 2:38 + +TODO: wrap this script so it iterates over filenames, instead of one-per-call + +## Dev Import Counts + +Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 2953841, 'skip-key-type': 2640968, 'skip-arxiv-corr': 312872, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) + +Container imports: + + # blank database + Counter({'total': 6954, 'insert': 6944, 'skip-update': 10, 'skip': 0, 'update': 0, 'exists': 0}) + + # repeated + Counter({'total': 6954, 'insert': 5325, 'skip-update': 1629, 'skip': 0, 'update': 0, 'exists': 0}) + + # repeated with previous complete TSV file + Counter({'total': 6954, 'skip-update': 6954, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + +./fatcat_import.py dblp-release --dblp-container-map-file /data/dblp/all_dblp_containers.tsv /data/dblp/dblp.xml diff --git a/python/Pipfile b/python/Pipfile index 32969ac1..4b9f086b 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -55,7 +55,9 @@ pycountry = "==19.8.18" tldextract = "==3.*" toml = "==0.10.*" fuzzycat = "==0.1.21" -dynaconf = "*" +dynaconf = ">=3" +pydantic = "==1.*" +surt = "==0.3.*" [requires] # As of Fall 2020, Internet Archive cluster VMs are split between Ubuntu Xenial diff --git a/python/Pipfile.lock b/python/Pipfile.lock index fd41be8e..58c8e941 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "7529727ff616348ece2031bdb1c5440cb081f6ccc2c68dff8f1c40f698eaf76c" + "sha256": "2de1ee43cc2aaf80f93e1c8475a237442dd63196626f581f80015f21689d7de7" }, "pipfile-spec": 6, "requires": { @@ -133,11 +133,11 @@ }, "citeproc-py-styles": { "hashes": [ - "sha256:41107b2831f5490a13a95e8e2583101e0485968b2f6affb42d84e4e7e272dcf7", - "sha256:6c94a83b2d785345eff5471eafe4bcf794ee7fd5ba2921a9813b0814dfb62bf7" + "sha256:22183eeab9f5c21811423a3d5e750e10c4ceb817ee9cb36814368fc8c84a6d78", + "sha256:94062470b4e6f0deb801e6a2c03fe1672a2f97dd0d8c1882c92ff60725d16405" ], "index": "pypi", - "version": "==0.1.2" + "version": "==0.1.3" }, "click": { "hashes": [ @@ -209,11 +209,11 @@ }, "elasticsearch": { "hashes": [ - "sha256:8213d6b7d3f984a23e2f8b6ff63366224b3de9129839ed0cb8195d9e8339dc85", - "sha256:da3de0451c61e9357cec2f8bf32d8aea65974e5717b5deef53718392cf4c2985" + "sha256:a09ae1de8869efa6ef2d9a0a9b9f6d9260b0c2506e83dd32bc1119a23fff49a5", + "sha256:d6bcca0b2e5665d08e6fe6fadc2d4d321affd76ce483603078fc9d3ccd2bc0f9" ], "index": "pypi", - "version": "==7.13.0" + "version": "==7.13.1" }, "elasticsearch-dsl": { "hashes": [ @@ -508,6 +508,34 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.20" }, + "pydantic": { + "hashes": [ + "sha256:021ea0e4133e8c824775a0cfe098677acf6fa5a3cbf9206a376eed3fc09302cd", + "sha256:05ddfd37c1720c392f4e0d43c484217b7521558302e7069ce8d318438d297739", + "sha256:05ef5246a7ffd2ce12a619cbb29f3307b7c4509307b1b49f456657b43529dc6f", + "sha256:10e5622224245941efc193ad1d159887872776df7a8fd592ed746aa25d071840", + "sha256:18b5ea242dd3e62dbf89b2b0ec9ba6c7b5abaf6af85b95a97b00279f65845a23", + "sha256:234a6c19f1c14e25e362cb05c68afb7f183eb931dd3cd4605eafff055ebbf287", + "sha256:244ad78eeb388a43b0c927e74d3af78008e944074b7d0f4f696ddd5b2af43c62", + "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b", + "sha256:41b542c0b3c42dc17da70554bc6f38cbc30d7066d2c2815a94499b5684582ecb", + "sha256:4a03cbbe743e9c7247ceae6f0d8898f7a64bb65800a45cbdc52d65e370570820", + "sha256:4be75bebf676a5f0f87937c6ddb061fa39cbea067240d98e298508c1bda6f3f3", + "sha256:54cd5121383f4a461ff7644c7ca20c0419d58052db70d8791eacbbe31528916b", + "sha256:589eb6cd6361e8ac341db97602eb7f354551482368a37f4fd086c0733548308e", + "sha256:8621559dcf5afacf0069ed194278f35c255dc1a1385c28b32dd6c110fd6531b3", + "sha256:8b223557f9510cf0bfd8b01316bf6dd281cf41826607eada99662f5e4963f316", + "sha256:99a9fc39470010c45c161a1dc584997f1feb13f689ecf645f59bb4ba623e586b", + "sha256:a7c6002203fe2c5a1b5cbb141bb85060cbff88c2d78eccbc72d97eb7022c43e4", + "sha256:a83db7205f60c6a86f2c44a61791d993dff4b73135df1973ecd9eed5ea0bda20", + "sha256:ac8eed4ca3bd3aadc58a13c2aa93cd8a884bcf21cb019f8cfecaae3b6ce3746e", + "sha256:e710876437bc07bd414ff453ac8ec63d219e7690128d925c6e82889d674bb505", + "sha256:ea5cb40a3b23b3265f6325727ddfc45141b08ed665458be8c6285e7b85bd73a1", + "sha256:fec866a0b59f372b7e776f2d7308511784dace622e0992a0b59ea3ccee0ae833" + ], + "index": "pypi", + "version": "==1.8.2" + }, "pygal": { "hashes": [ "sha256:27abab93cbc31e21f3c6bdecc05bda6cd3570cbdbd8297b7caa6904051b50d72", @@ -572,19 +600,19 @@ }, "python-dotenv": { "hashes": [ - "sha256:00aa34e92d992e9f8383730816359647f358f4a3be1ba45e5a5cefd27ee91544", - "sha256:b1ae5e9643d5ed987fc57cc2583021e38db531946518130777734f9589b3141f" + "sha256:dd8fe852847f4fbfadabf6183ddd4c824a9651f02d51714fa075c95561959c7d", + "sha256:effaac3c1e58d89b3ccb4d04a40dc7ad6e0275fda25fd75ae9d323e2465e202d" ], "index": "pypi", - "version": "==0.17.1" + "version": "==0.18.0" }, "python-magic": { "hashes": [ - "sha256:3790dc06e5abf9d618d288adf831f159ca9a3872aac7bcb8b7008b4a08d9809c", - "sha256:88f71d04fde4318da32fa03930362f1c6127caa833614563fd53a0fd3438cc3e" + "sha256:4fec8ee805fea30c07afccd1592c0f17977089895bdfaae5fec870a84e997626", + "sha256:de800df9fb50f8ec5974761054a708af6e4246b03b4bdaee993f948947b0ebcf" ], "index": "pypi", - "version": "==0.4.23" + "version": "==0.4.24" }, "python-snappy": { "hashes": [ @@ -773,6 +801,13 @@ "markers": "python_version >= '3.0'", "version": "==2.2.1" }, + "surt": { + "hashes": [ + "sha256:24167eb6c01f24f757eef9bca6bf0ec089ec05ad5b6213c3b727a5e58c0c4720" + ], + "index": "pypi", + "version": "==0.3.1" + }, "tldextract": { "hashes": [ "sha256:cfae9bc8bda37c3e8c7c8639711ad20e95dc85b207a256b60b0b23d7ff5540ea", @@ -789,6 +824,14 @@ "index": "pypi", "version": "==0.10.2" }, + "typing-extensions": { + "hashes": [ + "sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497", + "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342", + "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84" + ], + "version": "==3.10.0.0" + }, "tzlocal": { "hashes": [ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", @@ -885,7 +928,7 @@ "sha256:f98fc5750aac2d63d482909184aac72a979bfd123b112ec53fd365104ea15b1c", "sha256:ff5b75f94101beaa373f1511319580a010f6e03458ee51b1a386d7de5331440a" ], - "markers": "python_version >= '3.5'", + "markers": "python_full_version >= '3.5.0'", "version": "==0.15.2" } }, @@ -929,7 +972,6 @@ "version": "==4.0.0" }, "coverage": { - "extras": [], "hashes": [ "sha256:004d1880bed2d97151facef49f08e255a20ceb6f9432df75f4eef018fdd5a78c", "sha256:01d84219b5cdbfc8122223b39a954820929497a1cb1422824bb86b07b74594b6", @@ -984,7 +1026,7 @@ "sha256:f0b278ce10936db1a37e6954e15a3730bea96a0997c26d7fee88e6c396c2086d", "sha256:f11642dddbb0253cc8853254301b51390ba0081750a8ac03f20ea8103f0c56b6" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==5.5" }, "decorator": { @@ -1013,11 +1055,11 @@ }, "ipython": { "hashes": [ - "sha256:a171caa3d3d4c819a1c0742e3abecfd5a2b8ab525ca1c9f114b40b76b0679ab1", - "sha256:f86788eef439891438af3498525094cc2acbdbea4f2aa2f8895782d4ff471341" + "sha256:9bc24a99f5d19721fb8a2d1408908e9c0520a17fff2233ffe82620847f17f1b6", + "sha256:d513e93327cf8657d6467c81f1f894adc125334ffe0e4ddd1abbb1c78d828703" ], "index": "pypi", - "version": "==7.24.0" + "version": "==7.24.1" }, "ipython-genutils": { "hashes": [ @@ -1028,11 +1070,11 @@ }, "isort": { "hashes": [ - "sha256:0a943902919f65c5684ac4e0154b1ad4fac6dcaa5d9f3426b732f1c8b5419be6", - "sha256:2bb1680aad211e3c9944dbce1d4ba09a989f04e238296c87fe2139faa26d655d" + "sha256:83510593e07e433b77bd5bff0f6f607dbafa06d1a89022616f02d8b699cfcd56", + "sha256:8e2c107091cfec7286bc0f68a547d0ba4c094d460b732075b6fba674f1035c0c" ], - "markers": "python_version >= '3.6' and python_version < '4'", - "version": "==5.8.0" + "markers": "python_version < '4.0' and python_full_version >= '3.6.1'", + "version": "==5.9.1" }, "jedi": { "hashes": [ @@ -1134,11 +1176,11 @@ }, "prompt-toolkit": { "hashes": [ - "sha256:bf00f22079f5fadc949f42ae8ff7f05702826a97059ffcc6281036ad40ac6f04", - "sha256:e1b4f11b9336a28fa11810bc623c357420f69dfdb6d2dac41ca2c21a55c033bc" + "sha256:08360ee3a3148bdb5163621709ee322ec34fc4375099afa4bbf751e9b7b7fa4f", + "sha256:7089d8d2938043508aa9420ec18ce0922885304cddae87fb96eebca942299f88" ], "markers": "python_full_version >= '3.6.1'", - "version": "==3.0.18" + "version": "==3.0.19" }, "ptyprocess": { "hashes": [ diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py index 6436f876..77ce1bff 100644 --- a/python/tests/transform_csl.py +++ b/python/tests/transform_csl.py @@ -55,8 +55,9 @@ def test_csl_pubmed_bibtex(crossref_importer): number={4}, journal={Roczniki Panstwowego Zakladu Higieny}, author={Mędrela-Kuder and Szymura}, - year={2018}} + year={2018} + } """.strip() assert citeproc_csl(csl, 'harvard1', html=True).strip() == """ - Mędrela-Kuder & Szymura, 2018. Selected anti-health behaviours among women with osteoporosis. <i>Roczniki Panstwowego Zakladu Higieny</i>, 69`(4). + Mędrela-Kuder and Szymura (2018) ‘Selected anti-health behaviours among women with osteoporosis’, <i>Roczniki Panstwowego Zakladu Higieny</i>, 69`(4). doi: 10.32394/rpzh.2018.0046. """.strip() diff --git a/python/tests/web_citation_csl.py b/python/tests/web_citation_csl.py index fb3ce58d..50a2d6e8 100644 --- a/python/tests/web_citation_csl.py +++ b/python/tests/web_citation_csl.py @@ -23,7 +23,7 @@ def test_release_bibtex(app, api): json.loads(rv.data.decode('utf-8')) rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaam/citeproc?style=modern-language-association') assert rv.status_code == 200 - assert rv.data.decode('utf-8').startswith('Ioannidis, John. “Why Most Published Research Findings Are False”. 2.8 (2005)') + assert rv.data.decode('utf-8').startswith('Ioannidis, J.. Why Most Published Research Findings Are False') # "dummy" demo entity; very minimal metadata rv = app.get('/release/aaaaaaaaaaaaarceaaaaaaaaai') |