aboutsummaryrefslogtreecommitdiffstats
path: root/notebooks/Cluster Size and Title Length.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'notebooks/Cluster Size and Title Length.ipynb')
-rw-r--r--notebooks/Cluster Size and Title Length.ipynb553
1 files changed, 0 insertions, 553 deletions
diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb
deleted file mode 100644
index b78ba8b..0000000
--- a/notebooks/Cluster Size and Title Length.ipynb
+++ /dev/null
@@ -1,553 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "5818143"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>size</th>\n",
- " <th>len</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>264</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>2</td>\n",
- " <td>3</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>2</td>\n",
- " <td>3</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>2</td>\n",
- " <td>3</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>2</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " size len\n",
- "0 264 0\n",
- "1 2 3\n",
- "2 2 3\n",
- "3 2 3\n",
- "4 2 4"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "pd.set_option('display.float_format', lambda x: '%.3f' % x)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " size len\n",
- "count 5818143.000 5818143.000\n",
- "mean 4.350 52.120\n",
- "std 196.347 35.026\n",
- "min 2.000 0.000\n",
- "25% 2.000 24.000\n",
- "50% 2.000 46.000\n",
- "75% 3.000 72.000\n",
- "max 151383.000 11686.000\n"
- ]
- }
- ],
- "source": [
- "print(df.describe())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>size</th>\n",
- " <th>len</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>264</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>187</th>\n",
- " <td>5</td>\n",
- " <td>1</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>276</th>\n",
- " <td>28</td>\n",
- " <td>11</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>284</th>\n",
- " <td>7</td>\n",
- " <td>6</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>288</th>\n",
- " <td>6</td>\n",
- " <td>6</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818054</th>\n",
- " <td>7</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818060</th>\n",
- " <td>6</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818104</th>\n",
- " <td>6</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818118</th>\n",
- " <td>5</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818128</th>\n",
- " <td>13</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>448170 rows × 2 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " size len\n",
- "0 264 0\n",
- "187 5 1\n",
- "276 28 11\n",
- "284 7 6\n",
- "288 6 6\n",
- "... ... ...\n",
- "5818054 7 2\n",
- "5818060 6 4\n",
- "5818104 6 2\n",
- "5818118 5 4\n",
- "5818128 13 4\n",
- "\n",
- "[448170 rows x 2 columns]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df[\"size\"] > 4]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>size</th>\n",
- " <th>len</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>264</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>276</th>\n",
- " <td>28</td>\n",
- " <td>11</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>314</th>\n",
- " <td>195</td>\n",
- " <td>15</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>329</th>\n",
- " <td>10</td>\n",
- " <td>14</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>364</th>\n",
- " <td>98</td>\n",
- " <td>15</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817734</th>\n",
- " <td>18</td>\n",
- " <td>5</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817835</th>\n",
- " <td>11</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817886</th>\n",
- " <td>20</td>\n",
- " <td>5</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817901</th>\n",
- " <td>15</td>\n",
- " <td>10</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5818128</th>\n",
- " <td>13</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>159500 rows × 2 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " size len\n",
- "0 264 0\n",
- "276 28 11\n",
- "314 195 15\n",
- "329 10 14\n",
- "364 98 15\n",
- "... ... ...\n",
- "5817734 18 5\n",
- "5817835 11 4\n",
- "5817886 20 5\n",
- "5817901 15 10\n",
- "5818128 13 4\n",
- "\n",
- "[159500 rows x 2 columns]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df[\"size\"] >= 10]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>size</th>\n",
- " <th>len</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>264</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>314</th>\n",
- " <td>195</td>\n",
- " <td>15</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>428</th>\n",
- " <td>122</td>\n",
- " <td>31</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>525</th>\n",
- " <td>173</td>\n",
- " <td>28</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>727</th>\n",
- " <td>270</td>\n",
- " <td>31</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5816100</th>\n",
- " <td>147</td>\n",
- " <td>4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817345</th>\n",
- " <td>167</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817361</th>\n",
- " <td>258</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817366</th>\n",
- " <td>298</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5817374</th>\n",
- " <td>252</td>\n",
- " <td>2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>9610 rows × 2 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " size len\n",
- "0 264 0\n",
- "314 195 15\n",
- "428 122 31\n",
- "525 173 28\n",
- "727 270 31\n",
- "... ... ...\n",
- "5816100 147 4\n",
- "5817345 167 2\n",
- "5817361 258 2\n",
- "5817366 298 2\n",
- "5817374 252 2\n",
- "\n",
- "[9610 rows x 2 columns]"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df[\"size\"] > 100]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "5818143"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "fuzzycat",
- "language": "python",
- "name": "fuzzycat"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}