diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | fuzzycat/cluster.py | 1 | ||||
-rw-r--r-- | notebooks/Cluster Size and Title Length.ipynb | 553 | ||||
-rw-r--r-- | notes/Clustering.md | 48 |
4 files changed, 602 insertions, 1 deletions
@@ -132,3 +132,4 @@ dmypy.json /data /names.db /tmp +fixtures/cluster_title_normalized_dups_size_keylen.tsv diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 3b7f3f5..6c68bfc 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -48,6 +48,7 @@ DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat") def sort_by_column(filename, mode="w", opts="-k 2", fast=True, prefix="fuzzycat-"): """ Sort tabular file with sort(1), returns the filename of the sorted file. + XXX: use separate /fast/tmp for sort. """ with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf: env = os.environ.copy() diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb new file mode 100644 index 0000000..b78ba8b --- /dev/null +++ b/notebooks/Cluster Size and Title Length.ipynb @@ -0,0 +1,553 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5818143" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " 
<thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>size</th>\n", + " <th>len</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>264</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "1 2 3\n", + "2 2 3\n", + "3 2 3\n", + "4 2 4" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.3f' % x)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " size len\n", + "count 5818143.000 5818143.000\n", + "mean 4.350 52.120\n", + "std 196.347 35.026\n", + "min 2.000 0.000\n", + "25% 2.000 24.000\n", + "50% 2.000 46.000\n", + "75% 3.000 72.000\n", + "max 151383.000 11686.000\n" + ] + } + ], + "source": [ + "print(df.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>size</th>\n", + " <th>len</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>264</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>187</th>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>276</th>\n", + " <td>28</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>284</th>\n", + " <td>7</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>288</th>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818054</th>\n", + " <td>7</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818060</th>\n", + " <td>6</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818104</th>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818118</th>\n", + " <td>5</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818128</th>\n", + " <td>13</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>448170 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "187 5 1\n", + "276 28 11\n", + "284 7 6\n", + "288 6 6\n", + "... ... 
...\n", + "5818054 7 2\n", + "5818060 6 4\n", + "5818104 6 2\n", + "5818118 5 4\n", + "5818128 13 4\n", + "\n", + "[448170 rows x 2 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] > 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>size</th>\n", + " <th>len</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>264</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>276</th>\n", + " <td>28</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>314</th>\n", + " <td>195</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>329</th>\n", + " <td>10</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>364</th>\n", + " <td>98</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817734</th>\n", + " <td>18</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817835</th>\n", + " <td>11</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817886</th>\n", + " <td>20</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817901</th>\n", + " <td>15</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5818128</th>\n", + " <td>13</td>\n", + " <td>4</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>159500 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": 
[ + " size len\n", + "0 264 0\n", + "276 28 11\n", + "314 195 15\n", + "329 10 14\n", + "364 98 15\n", + "... ... ...\n", + "5817734 18 5\n", + "5817835 11 4\n", + "5817886 20 5\n", + "5817901 15 10\n", + "5818128 13 4\n", + "\n", + "[159500 rows x 2 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] >= 10]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>size</th>\n", + " <th>len</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>264</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>314</th>\n", + " <td>195</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>428</th>\n", + " <td>122</td>\n", + " <td>31</td>\n", + " </tr>\n", + " <tr>\n", + " <th>525</th>\n", + " <td>173</td>\n", + " <td>28</td>\n", + " </tr>\n", + " <tr>\n", + " <th>727</th>\n", + " <td>270</td>\n", + " <td>31</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5816100</th>\n", + " <td>147</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817345</th>\n", + " <td>167</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817361</th>\n", + " <td>258</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817366</th>\n", + " <td>298</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5817374</th>\n", + " <td>252</td>\n", + " 
<td>2</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>9610 rows × 2 columns</p>\n", + "</div>" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "314 195 15\n", + "428 122 31\n", + "525 173 28\n", + "727 270 31\n", + "... ... ...\n", + "5816100 147 4\n", + "5817345 167 2\n", + "5817361 258 2\n", + "5817366 298 2\n", + "5817374 252 2\n", + "\n", + "[9610 rows x 2 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] > 100]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5818143" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fuzzycat", + "language": "python", + "name": "fuzzycat" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notes/Clustering.md b/notes/Clustering.md index d794bdc..95baea3 100644 --- a/notes/Clustering.md +++ b/notes/Clustering.md @@ -16,7 +16,7 @@ Identification and Intelligence System, ...): $ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > cluster_title.json ``` -Parallel: +Parallel (use `--pipepart`): ``` $ zstdcat -T0 release_export_expanded.json.zst | \ @@ -32,6 +32,52 @@ Numbers of clusters: 119829458 cluster_title_nysiis.json ``` +The number of duplicate record goes up as number of clusters go down: + +``` + 2858088 cluster_title_dups.json + 5818143 cluster_title_normalized_dups.json + 6274940 
cluster_title_nysiis_dups.json +``` + +# Cluster numbers + +Using normalized title as example: + +* 4306860 have cluster size 2, 1511283 have cluster size 3 or larger + +``` + size len +count 5818143.000 5818143.000 +mean 4.350 52.120 +std 196.347 35.026 +min 2.000 0.000 +25% 2.000 24.000 +50% 2.000 46.000 +75% 3.000 72.000 +max 151383.000 11686.000 +``` + +Around 448170 clusters with size 5 or more (with some example titles): + +``` +Medical Notes +日本鉄鋼協会第97回講演大会講演概要 +Boutades +Allergic Contact Dermatitis +Comité international +Incontinence +Efficient Uncertainty Minimization for Fuzzy Spectral Clustering +Early Intervention +CURRENT READINGS IN NUCLEAR MEDICINE +Nannocystis exedens +``` + +Grouping. API, hide. + +* gnu parallel; top, htop; how much; "chunks"; read one line; "pipepart"; + batching; "read from a file"; scan a file; "chunking" + # TODO * [ ] do a SS like clustering, using title and author ngrams |