aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-10-22 20:15:46 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-10-22 20:15:46 +0200
commit2b216f17fccf6ff90b41ca972bf1730078cc6180 (patch)
tree4cf53ef1d9cec359e81251eebbd6aff2ad04b4b5
parent38b45bc6738b0d53326ee6a62dff15fcb62cfa9c (diff)
downloadfuzzycat-2b216f17fccf6ff90b41ca972bf1730078cc6180.tar.gz
fuzzycat-2b216f17fccf6ff90b41ca972bf1730078cc6180.zip
update notes on cluster, nb
-rw-r--r--.gitignore1
-rw-r--r--fuzzycat/cluster.py1
-rw-r--r--notebooks/Cluster Size and Title Length.ipynb553
-rw-r--r--notes/Clustering.md48
4 files changed, 602 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index a65a688..a1e72a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,4 @@ dmypy.json
/data
/names.db
/tmp
+fixtures/cluster_title_normalized_dups_size_keylen.tsv
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 3b7f3f5..6c68bfc 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -48,6 +48,7 @@ DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "fuzzycat")
def sort_by_column(filename, mode="w", opts="-k 2", fast=True, prefix="fuzzycat-"):
"""
Sort tabular file with sort(1), returns the filename of the sorted file.
+ XXX: use separate /fast/tmp for sort.
"""
with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf:
env = os.environ.copy()
diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb
new file mode 100644
index 0000000..b78ba8b
--- /dev/null
+++ b/notebooks/Cluster Size and Title Length.ipynb
@@ -0,0 +1,553 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5818143"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>size</th>\n",
+ " <th>len</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>264</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>2</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " size len\n",
+ "0 264 0\n",
+ "1 2 3\n",
+ "2 2 3\n",
+ "3 2 3\n",
+ "4 2 4"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option('display.float_format', lambda x: '%.3f' % x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " size len\n",
+ "count 5818143.000 5818143.000\n",
+ "mean 4.350 52.120\n",
+ "std 196.347 35.026\n",
+ "min 2.000 0.000\n",
+ "25% 2.000 24.000\n",
+ "50% 2.000 46.000\n",
+ "75% 3.000 72.000\n",
+ "max 151383.000 11686.000\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(df.describe())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>size</th>\n",
+ " <th>len</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>264</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>187</th>\n",
+ " <td>5</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>276</th>\n",
+ " <td>28</td>\n",
+ " <td>11</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>284</th>\n",
+ " <td>7</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>288</th>\n",
+ " <td>6</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818054</th>\n",
+ " <td>7</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818060</th>\n",
+ " <td>6</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818104</th>\n",
+ " <td>6</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818118</th>\n",
+ " <td>5</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818128</th>\n",
+ " <td>13</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>448170 rows × 2 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " size len\n",
+ "0 264 0\n",
+ "187 5 1\n",
+ "276 28 11\n",
+ "284 7 6\n",
+ "288 6 6\n",
+ "... ... ...\n",
+ "5818054 7 2\n",
+ "5818060 6 4\n",
+ "5818104 6 2\n",
+ "5818118 5 4\n",
+ "5818128 13 4\n",
+ "\n",
+ "[448170 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df[\"size\"] > 4]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>size</th>\n",
+ " <th>len</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>264</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>276</th>\n",
+ " <td>28</td>\n",
+ " <td>11</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>314</th>\n",
+ " <td>195</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>329</th>\n",
+ " <td>10</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>364</th>\n",
+ " <td>98</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817734</th>\n",
+ " <td>18</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817835</th>\n",
+ " <td>11</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817886</th>\n",
+ " <td>20</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817901</th>\n",
+ " <td>15</td>\n",
+ " <td>10</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5818128</th>\n",
+ " <td>13</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>159500 rows × 2 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " size len\n",
+ "0 264 0\n",
+ "276 28 11\n",
+ "314 195 15\n",
+ "329 10 14\n",
+ "364 98 15\n",
+ "... ... ...\n",
+ "5817734 18 5\n",
+ "5817835 11 4\n",
+ "5817886 20 5\n",
+ "5817901 15 10\n",
+ "5818128 13 4\n",
+ "\n",
+ "[159500 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df[\"size\"] >= 10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>size</th>\n",
+ " <th>len</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>264</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>314</th>\n",
+ " <td>195</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>428</th>\n",
+ " <td>122</td>\n",
+ " <td>31</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>525</th>\n",
+ " <td>173</td>\n",
+ " <td>28</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>727</th>\n",
+ " <td>270</td>\n",
+ " <td>31</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5816100</th>\n",
+ " <td>147</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817345</th>\n",
+ " <td>167</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817361</th>\n",
+ " <td>258</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817366</th>\n",
+ " <td>298</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5817374</th>\n",
+ " <td>252</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>9610 rows × 2 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " size len\n",
+ "0 264 0\n",
+ "314 195 15\n",
+ "428 122 31\n",
+ "525 173 28\n",
+ "727 270 31\n",
+ "... ... ...\n",
+ "5816100 147 4\n",
+ "5817345 167 2\n",
+ "5817361 258 2\n",
+ "5817366 298 2\n",
+ "5817374 252 2\n",
+ "\n",
+ "[9610 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df[\"size\"] > 100]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "5818143"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "fuzzycat",
+ "language": "python",
+ "name": "fuzzycat"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notes/Clustering.md b/notes/Clustering.md
index d794bdc..95baea3 100644
--- a/notes/Clustering.md
+++ b/notes/Clustering.md
@@ -16,7 +16,7 @@ Identification and Intelligence System, ...):
$ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > cluster_title.json
```
-Parallel:
+Parallel (use `--pipepart`):
```
$ zstdcat -T0 release_export_expanded.json.zst | \
@@ -32,6 +32,52 @@ Numbers of clusters:
119829458 cluster_title_nysiis.json
```
+The number of duplicate records goes up as the number of clusters goes down:
+
+```
+ 2858088 cluster_title_dups.json
+ 5818143 cluster_title_normalized_dups.json
+ 6274940 cluster_title_nysiis_dups.json
+```
+
+# Cluster numbers
+
+Using normalized title as example:
+
+* 4306860 have cluster size 2, 1511283 have cluster size 3 or larger
+
+```
+ size len
+count 5818143.000 5818143.000
+mean 4.350 52.120
+std 196.347 35.026
+min 2.000 0.000
+25% 2.000 24.000
+50% 2.000 46.000
+75% 3.000 72.000
+max 151383.000 11686.000
+```
+
+Around 448170 clusters with size 5 or more (with some example titles):
+
+```
+Medical Notes
+日本鉄鋼協会第97回講演大会講演概要
+Boutades
+Allergic Contact Dermatitis
+Comité international
+Incontinence
+Efficient Uncertainty Minimization for Fuzzy Spectral Clustering
+Early Intervention
+CURRENT READINGS IN NUCLEAR MEDICINE
+Nannocystis exedens
+```
+
+Grouping. API, hide.
+
+* gnu parallel; top, htop; how much; "chunks"; read one line; "pipepart";
+ batching; "read from a file"; scan a file; "chunking"
+
# TODO
* [ ] do a SS like clustering, using title and author ngrams