aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-26 00:35:39 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-26 00:35:39 +0100
commitcf87f0e91f43f8039f9a04a717cba2a3f96ab4eb (patch)
treed9f3cadbe40ce4696693e861129ca78318021f58
parenta8d155cf574cdf5bf5ed1f7ec58f470fb724a7b3 (diff)
downloadfuzzycat-cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb.tar.gz
fuzzycat-cf87f0e91f43f8039f9a04a717cba2a3f96ab4eb.zip
update notes
-rw-r--r--README.md24
-rw-r--r--fuzzycat/verify.py7
-rw-r--r--tests/data/verify.csv2
3 files changed, 32 insertions, 1 deletions
diff --git a/README.md b/README.md
index 0994d5e..6f90c80 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,29 @@ Example results over 10M docs:
}
```
+# A full run
+
+Single threaded, 42h.
+
+```
+$ time zstdcat -T0 release_export_expanded.json.zst | \
+ TMPDIR=/bigger/tmp python -m fuzzycat cluster --tmpdir /bigger/tmp -t tsandcrawler | \
+ zstd -c9 > cluster_tsandcrawler.json.zst
+{
+ "key_fail": 0,
+ "key_ok": 154202433,
+ "key_empty": 942,
+ "key_denylist": 0,
+ "num_clusters": 124321361
+}
+
+real 2559m7.880s
+user 2605m41.347s
+sys 118m38.141s
+```
+
+So, 29881072 docs (key_ok minus num_clusters, about 20%) are in the potentially duplicated set.
+
# Use cases
@@ -219,3 +242,4 @@ Ok cases are now in [verify.csv](https://github.com/miku/fuzzycat/blob/master/te
* [o] https://fatcat.wiki/release/lezvxt2oong6xm3e3cgp47wsla https://fatcat.wiki/release/aad6r5am6vfxpbfwycmyudp2qe Status.AMBIGUOUS OK.DUMMY
* [o] https://fatcat.wiki/release/5mzzswgebze2tk4apmbwjahp34 https://fatcat.wiki/release/vl7r3uewvvbo5i2gntocy3y2ey Status.AMBIGUOUS OK.DUMMY
+
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 63df679..54cabe4 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -56,6 +56,13 @@ WIPv1 (10m)
TODO: allow passing in a DOI blacklist, e.g. a list of DOIs which are no longer
valid; example: https://fatcat.wiki/release/azbcyqjnmrdofigpgk24ck4rpq,
https://fatcat.wiki/release/eb2uf5ae7bedxe22jasf2l3faa
+
+Author matching: one long string; e.g. as last name; take an acronym of the
+first name; Asian names; number of authors; what works specifically for the
+various md extractors
+
+Contributor lists; "ones that have the index set"; affiliations may end up
+there; "subset" is an ordered list; PubMed, Crossref important
"""
import collections
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 368c8c4..0e90ed7 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -85,4 +85,4 @@ jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT,
hqrvhbvocvaabg6nr5p43tl3uq,zfwf3tefajc6zdxa47vgilm7wm,TODO,
ppnzru2opnhxlai7pcmo7phe4i,iitldffmnncijgnf6ujb6zmdfu,Status.DIFFERENT,Miss.NUM_DIFF
fv35r37pb5c5tioyqburswsute,poeywm5o4raljhatd6zvehcicy,Status.EXACT,OK.TITLE_AUTHOR_MATCH
-s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,,
+s6znyezm4fdqfiihwcdtfapfqu,wz6kycfyqfdhhbcjteotw4jcbu,Status.STRONG,OK.PREPRINT_PUBLISHED