From bf142bb09fa00cdf0823b9216068c3160e143745 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 18 Nov 2020 00:46:36 +0100 Subject: update README --- README.md | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'README.md') diff --git a/README.md b/README.md index 33984b1..412495c 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,64 @@ Fuzzy matching publications for [fatcat](https://fatcat.wiki). Note: This is currently work-in-progress. +# Example Run + +Run any clustering algorithm. + +``` +$ time python -m fuzzycat cluster -t tsandcrawler < data/sample10m.json | \ + zstd -c9 > sample_cluster.json.zst +2020-11-18 00:19:48.194 DEBUG __main__ - run_cluster: + {"key_fail": 0, "key_ok": 9999938, "key_empty": 62, "key_denylist": 0, "num_clusters": 9040789} + +real 75m23.045s +user 95m14.455s +sys 3m39.121s +``` + +Run verification. + +``` +$ time zstdcat -T0 sample_cluster.json.zst | python -m fuzzycat verify > sample_verify.txt + +real 7m56.713s +user 8m50.703s +sys 0m29.262s +``` + + +Example results over 10M docs: + +```json +{ + "miss.appendix": 176, + "miss.arxiv_version": 25, + "miss.blacklisted": 12082, + "miss.blacklisted_fragment": 5, + "miss.book_chapter": 46733, + "miss.component": 1567, + "miss.contrib_intersection_empty": 47691, + "miss.dataset_doi": 30806, + "miss.num_diff": 1, + "miss.release_type": 157718, + "miss.short_title": 16263, + "miss.subtitle": 6013, + "miss.title_filename": 57, + "miss.year": 148755, + "ok.arxiv_version": 93, + "ok.dummy": 88294, + "ok.preprint_published": 110, + "ok.slug_title_author_match": 15818, + "ok.title_author_match": 93240, + "skip.container_name_blacklist": 20, + "skip.publisher_blacklist": 456, + "skip.too_large": 7430, + "skip.unique": 8808462, + "total": 9481815 +} +``` + + # Use cases * [ ] take a release entity database dump as JSON lines and cluster releases -- cgit v1.2.3