about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/verify.py 4
-rw-r--r-- notes/2020_11_testruns.md 31
2 files changed, 33 insertions, 2 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index dd57506..e7b397f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -258,8 +258,8 @@ def compare(a, b):
if a_slug_title == b_slug_title:
try:
- a_subtitles = glom(a, "extra.subtitle")
- b_subtitles = glom(b, "extra.subtitle")
+ a_subtitles = glom(a, "extra.subtitle") or []
+ b_subtitles = glom(b, "extra.subtitle") or []
for a_sub in a_subtitles:
for b_sub in b_subtitles:
if slugify_string(a_sub) != slugify_string(b_sub):
diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md
new file mode 100644
index 0000000..31c292c
--- /dev/null
+++ b/notes/2020_11_testruns.md
@@ -0,0 +1,31 @@
+# Test runs
+
+## Using --min-cluster-size
+
+Skipping writes of single-element clusters cuts clustering from ~42h to ~22h.
+
+```
+$ time zstdcat -T0 release_export_expanded.json.zst | \
+ TMPDIR=/bigger/tmp python -m fuzzycat cluster --min-cluster-size 2 \
+ --tmpdir /bigger/tmp -t tsandcrawler | \
+ zstd -c9 > cluster_tsandcrawler_min_cluster_size_2.json.zst
+...
+max cluster size cut off for: 雜報その1
+max cluster size cut off for: 雜録
+2020-11-27 18:31:39.825 DEBUG __main__ - run_cluster: {"key_fail": 0, "key_ok":
+154202433, "key_empty": 942, "key_denylist": 0, "num_clusters": 11763096}
+
+real 1328m46.994s
+user 1088m6.837s
+sys 98m17.501s
+```
+
+We find 11763096 clusters; the output is 16GB compressed (reading it back
+with zstdcat takes about 5min, i.e. a sequential read at roughly 50MB/s).
+
+```
+$ time zstdcat -T0 cluster_tsandcrawler_min_cluster_size_2.json.zst | \
+ python -m fuzzycat verify | \
+ zstd -T0 -c9 > cluster_tsandcrawler_min_cluster_size_2_verify.tsv.zst
+```
+