From cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 26 Nov 2020 00:35:39 +0100 Subject: update notes --- README.md | 24 ++++++++++++++++++++++++ fuzzycat/verify.py | 7 +++++++ tests/data/verify.csv | 2 +- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0994d5e..6f90c80 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,29 @@ Example results over 10M docs: } ``` +# A full run + +Single threaded, 42h. + +``` +$ time zstdcat -T0 release_export_expanded.json.zst | \ + TMPDIR=/bigger/tmp python -m fuzzycat cluster --tmpdir /bigger/tmp -t tsandcrawler | \ + zstd -c9 > cluster_tsandcrawler.json.zst +{ + "key_fail": 0, + "key_ok": 154202433, + "key_empty": 942, + "key_denylist": 0, + "num_clusters": 124321361 +} + +real 2559m7.880s +user 2605m41.347s +sys 118m38.141s +``` + +So, 29881072 (about 20%) docs in the potentially duplicated set. + # Use cases @@ -219,3 +242,4 @@ Ok cases are now in [verify.csv](https://github.com/miku/fuzzycat/blob/master/te * [o] https://fatcat.wiki/release/lezvxt2oong6xm3e3cgp47wsla https://fatcat.wiki/release/aad6r5am6vfxpbfwycmyudp2qe Status.AMBIGUOUS OK.DUMMY * [o] https://fatcat.wiki/release/5mzzswgebze2tk4apmbwjahp34 https://fatcat.wiki/release/vl7r3uewvvbo5i2gntocy3y2ey Status.AMBIGUOUS OK.DUMMY + diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 63df679..54cabe4 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -56,6 +56,13 @@ WIPv1 (10m) TODO: allow to pass in a DOI blacklist, e.g. a list of DOI which are not valid any more; example: https://fatcat.wiki/release/azbcyqjnmrdofigpgk24ck4rpq, https://fatcat.wiki/release/eb2uf5ae7bedxe22jasf2l3faa + +Author matching: one long string; e.g. 
as last name; take an acronym of the +first name; Asian names; number of authors; what works specifically for the +various metadata extractors + +Contributor lists; "one that have the index set"; affiliations may end up +there; "subset" is an ordered list; PubMed, Crossref important """ import collections diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 368c8c4..0e90ed7 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -85,4 +85,4 @@ jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT, hqrvhbvocvaabg6nr5p43tl3uq,zfwf3tefajc6zdxa47vgilm7wm,TODO, ppnzru2opnhxlai7pcmo7phe4i,iitldffmnncijgnf6ujb6zmdfu,Status.DIFFERENT,Miss.NUM_DIFF fv35r37pb5c5tioyqburswsute,poeywm5o4raljhatd6zvehcicy,Status.EXACT,OK.TITLE_AUTHOR_MATCH -s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,, +s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,Status.STRONG,OK.PREPRINT_PUBLISHED -- cgit v1.2.3