diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-26 00:35:39 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-26 00:35:39 +0100 |
commit | cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb (patch) | |
tree | d9f3cadbe40ce4696693e861129ca78318021f58 | |
parent | a8d155cf574cdf5bf5ed1f7ec58f470fb724a7b3 (diff) | |
download | fuzzycat-cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb.tar.gz fuzzycat-cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb.zip |
update notes
-rw-r--r-- | README.md | 24 | ||||
-rw-r--r-- | fuzzycat/verify.py | 7 | ||||
-rw-r--r-- | tests/data/verify.csv | 2 |
3 files changed, 32 insertions, 1 deletions
@@ -59,6 +59,29 @@ Example results over 10M docs: } ``` +# A full run + +Single threaded, 42h. + +``` +$ time zstdcat -T0 release_export_expanded.json.zst | \ + TMPDIR=/bigger/tmp python -m fuzzycat cluster --tmpdir /bigger/tmp -t tsandcrawler | \ + zstd -c9 > cluster_tsandcrawler.json.zst +{ + "key_fail": 0, + "key_ok": 154202433, + "key_empty": 942, + "key_denylist": 0, + "num_clusters": 124321361 +} + +real 2559m7.880s +user 2605m41.347s +sys 118m38.141s +``` + +So, 29881072 (about 20%) docs in the potentially duplicated set. + # Use cases @@ -219,3 +242,4 @@ Ok cases are now in [verify.csv](https://github.com/miku/fuzzycat/blob/master/te * [o] https://fatcat.wiki/release/lezvxt2oong6xm3e3cgp47wsla https://fatcat.wiki/release/aad6r5am6vfxpbfwycmyudp2qe Status.AMBIGUOUS OK.DUMMY * [o] https://fatcat.wiki/release/5mzzswgebze2tk4apmbwjahp34 https://fatcat.wiki/release/vl7r3uewvvbo5i2gntocy3y2ey Status.AMBIGUOUS OK.DUMMY + diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 63df679..54cabe4 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -56,6 +56,13 @@ WIPv1 (10m) TODO: allow to pass in a DOI blacklist, e.g. a list of DOI which are not valid any more; example: https://fatcat.wiki/release/azbcyqjnmrdofigpgk24ck4rpq, https://fatcat.wiki/release/eb2uf5ae7bedxe22jasf2l3faa + +Author matching: one long string; e.g. 
as last name; take an acronym of the +first name; Asian names; number of authors; what works specifically for the +various md extractors + +Contributor lists; "one that have the index set"; affiliations may end up +there; "subset" is an ordered list; PubMed, Crossref important """ import collections diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 368c8c4..0e90ed7 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -85,4 +85,4 @@ jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT, hqrvhbvocvaabg6nr5p43tl3uq,zfwf3tefajc6zdxa47vgilm7wm,TODO, ppnzru2opnhxlai7pcmo7phe4i,iitldffmnncijgnf6ujb6zmdfu,Status.DIFFERENT,Miss.NUM_DIFF fv35r37pb5c5tioyqburswsute,poeywm5o4raljhatd6zvehcicy,Status.EXACT,OK.TITLE_AUTHOR_MATCH -s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,, +s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,Status.STRONG,OK.PREPRINT_PUBLISHED |