From 2b216f17fccf6ff90b41ca972bf1730078cc6180 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 22 Oct 2020 20:15:46 +0200
Subject: update notes on cluster, nb

---
 .gitignore                                    |   1 +
 fuzzycat/cluster.py                           |   1 +
 notebooks/Cluster Size and Title Length.ipynb | 553 ++++++++++++++++++++++++++
 notes/Clustering.md                           |  48 ++-
 4 files changed, 602 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/Cluster Size and Title Length.ipynb
diff --git a/.gitignore b/.gitignore
index a65a688..a1e72a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,4 @@ dmypy.json
 /data
 /names.db
 /tmp
+fixtures/cluster_title_normalized_dups_size_keylen.tsv
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 3b7f3f5..6c68bfc 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -48,6 +48,7 @@ DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat")
 def sort_by_column(filename, mode="w", opts="-k 2", fast=True, prefix="fuzzycat-"):
     """
     Sort tabular file with sort(1), returns the filename of the sorted file.
+    XXX: use separate /fast/tmp for sort.
     """
     with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf:
         env = os.environ.copy()
diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb
new file mode 100644
index 0000000..b78ba8b
--- /dev/null
+++ b/notebooks/Cluster Size and Title Length.ipynb	
@@ -0,0 +1,553 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "5818143"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>size</th>\n",
+       "      <th>len</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>264</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   size  len\n",
+       "0   264    0\n",
+       "1     2    3\n",
+       "2     2    3\n",
+       "3     2    3\n",
+       "4     2    4"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.float_format', lambda x: '%.3f' % x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "             size         len\n",
+      "count 5818143.000 5818143.000\n",
+      "mean        4.350      52.120\n",
+      "std       196.347      35.026\n",
+      "min         2.000       0.000\n",
+      "25%         2.000      24.000\n",
+      "50%         2.000      46.000\n",
+      "75%         3.000      72.000\n",
+      "max    151383.000   11686.000\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(df.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>size</th>\n",
+       "      <th>len</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>264</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>187</th>\n",
+       "      <td>5</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>276</th>\n",
+       "      <td>28</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>284</th>\n",
+       "      <td>7</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>288</th>\n",
+       "      <td>6</td>\n",
+       "      <td>6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818054</th>\n",
+       "      <td>7</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818060</th>\n",
+       "      <td>6</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818104</th>\n",
+       "      <td>6</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818118</th>\n",
+       "      <td>5</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818128</th>\n",
+       "      <td>13</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>448170 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         size  len\n",
+       "0         264    0\n",
+       "187         5    1\n",
+       "276        28   11\n",
+       "284         7    6\n",
+       "288         6    6\n",
+       "...       ...  ...\n",
+       "5818054     7    2\n",
+       "5818060     6    4\n",
+       "5818104     6    2\n",
+       "5818118     5    4\n",
+       "5818128    13    4\n",
+       "\n",
+       "[448170 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df[\"size\"] > 4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>size</th>\n",
+       "      <th>len</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>264</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>276</th>\n",
+       "      <td>28</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>314</th>\n",
+       "      <td>195</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>329</th>\n",
+       "      <td>10</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>364</th>\n",
+       "      <td>98</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817734</th>\n",
+       "      <td>18</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817835</th>\n",
+       "      <td>11</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817886</th>\n",
+       "      <td>20</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817901</th>\n",
+       "      <td>15</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5818128</th>\n",
+       "      <td>13</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>159500 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         size  len\n",
+       "0         264    0\n",
+       "276        28   11\n",
+       "314       195   15\n",
+       "329        10   14\n",
+       "364        98   15\n",
+       "...       ...  ...\n",
+       "5817734    18    5\n",
+       "5817835    11    4\n",
+       "5817886    20    5\n",
+       "5817901    15   10\n",
+       "5818128    13    4\n",
+       "\n",
+       "[159500 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df[\"size\"] >= 10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>size</th>\n",
+       "      <th>len</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>264</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>314</th>\n",
+       "      <td>195</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>428</th>\n",
+       "      <td>122</td>\n",
+       "      <td>31</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>525</th>\n",
+       "      <td>173</td>\n",
+       "      <td>28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>727</th>\n",
+       "      <td>270</td>\n",
+       "      <td>31</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5816100</th>\n",
+       "      <td>147</td>\n",
+       "      <td>4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817345</th>\n",
+       "      <td>167</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817361</th>\n",
+       "      <td>258</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817366</th>\n",
+       "      <td>298</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5817374</th>\n",
+       "      <td>252</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>9610 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         size  len\n",
+       "0         264    0\n",
+       "314       195   15\n",
+       "428       122   31\n",
+       "525       173   28\n",
+       "727       270   31\n",
+       "...       ...  ...\n",
+       "5816100   147    4\n",
+       "5817345   167    2\n",
+       "5817361   258    2\n",
+       "5817366   298    2\n",
+       "5817374   252    2\n",
+       "\n",
+       "[9610 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df[\"size\"] > 100]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "5818143"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "fuzzycat",
+   "language": "python",
+   "name": "fuzzycat"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notes/Clustering.md b/notes/Clustering.md
index d794bdc..95baea3 100644
--- a/notes/Clustering.md
+++ b/notes/Clustering.md
@@ -16,7 +16,7 @@ Identification and Intelligence System, ...):
 $ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > cluster_title.json
 ```
 
-Parallel:
+Parallel (use `--pipepart`):
 
 ```
 $ zstdcat -T0 release_export_expanded.json.zst | \
@@ -32,6 +32,52 @@ Numbers of clusters:
   119829458 cluster_title_nysiis.json
 ```
 
+The number of duplicate records goes up as the number of clusters goes down:
+
+```
+   2858088 cluster_title_dups.json
+   5818143 cluster_title_normalized_dups.json
+   6274940 cluster_title_nysiis_dups.json
+```
+
+# Cluster numbers
+
+Using normalized title as example:
+
+* 4306860 have cluster size 2, 1511283 have cluster size 3 or larger
+
+```
+             size         len
+count 5818143.000 5818143.000
+mean        4.350      52.120
+std       196.347      35.026
+min         2.000       0.000
+25%         2.000      24.000
+50%         2.000      46.000
+75%         3.000      72.000
+max    151383.000   11686.000
+```
+
+Around 448170 clusters with size 5 or more (with some example titles):
+
+```
+Medical Notes
+日本鉄鋼協会第97回講演大会講演概要
+Boutades
+Allergic Contact Dermatitis
+Comité international
+Incontinence
+Efficient Uncertainty Minimization for Fuzzy Spectral Clustering
+Early Intervention
+CURRENT READINGS IN NUCLEAR MEDICINE
+Nannocystis exedens
+```
+
+Grouping. API, hide.
+
+* gnu parallel; top, htop; how much; "chunks"; read one line; "pipepart";
+  batching; "read from a file"; scan a file; "chunking"
+
 # TODO
 
 * [ ] do a SS like clustering, using title and author ngrams
-- 
cgit v1.2.3


	size	len
0	264	0
187	5	1
276	28	11
284	7	6
288	6	6
...	...	...
5818054	7	2
5818060	6	4
5818104	6	2
5818118	5	4
5818128	13	4
	size	len
0	264	0
276	28	11
314	195	15
329	10	14
364	98	15
...	...	...
5817734	18	5
5817835	11	4
5817886	20	5
5817901	15	10
5818128	13	4
	size	len
0	264	0
314	195	15
428	122	31
525	173	28
727	270	31
...	...	...
5816100	147	4
5817345	167	2
5817361	258	2
5817366	298	2
5817374	252	2