2 files changed, 33 insertions, 2 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index dd57506..e7b397f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -258,8 +258,8 @@ def compare(a, b):
 
     if a_slug_title == b_slug_title:
         try:
-            a_subtitles = glom(a, "extra.subtitle")
-            b_subtitles = glom(b, "extra.subtitle")
+            a_subtitles = glom(a, "extra.subtitle") or []
+            b_subtitles = glom(b, "extra.subtitle") or []
             for a_sub in a_subtitles:
                 for b_sub in b_subtitles:
                     if slugify_string(a_sub) != slugify_string(b_sub):
diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md
new file mode 100644
index 0000000..31c292c
--- /dev/null
+++ b/notes/2020_11_testruns.md
@@ -0,0 +1,31 @@
+# Test runs
+
+## Using --min-cluster-size
+
+Skipping writes of single-element clusters cuts clustering time from ~42h to ~22h.
+
+```
+$ time zstdcat -T0 release_export_expanded.json.zst | \
+    TMPDIR=/bigger/tmp python -m fuzzycat cluster --min-cluster-size 2 \
+        --tmpdir /bigger/tmp -t tsandcrawler | \
+    zstd -c9 > cluster_tsandcrawler_min_cluster_size_2.json.zst
+...
+max cluster size cut off for: 雜報その1
+max cluster size cut off for: 雜録
+2020-11-27 18:31:39.825 DEBUG __main__ - run_cluster: {"key_fail": 0, "key_ok":
+154202433, "key_empty": 942, "key_denylist": 0, "num_clusters": 11763096}
+
+real    1328m46.994s
+user    1088m6.837s
+sys     98m17.501s
+```
+
+We find 11763096 clusters; the output is 16GB compressed (reading it back with
+zstdcat takes about 5min, i.e. a sequential read at roughly 50MB/s).
+
+```
+$ time zstdcat -T0 cluster_tsandcrawler_min_cluster_size_2.json.zst | \
+    python -m fuzzycat verify | \
+    zstd -T0 -c9 > cluster_tsandcrawler_min_cluster_size_2_verify.tsv.zst
+```
+