diff options
Diffstat (limited to 'notebooks/Cluster Size and Title Length.ipynb')
-rw-r--r-- | notebooks/Cluster Size and Title Length.ipynb | 553 |
1 files changed, 0 insertions, 553 deletions
diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb deleted file mode 100644 index b78ba8b..0000000 --- a/notebooks/Cluster Size and Title Length.ipynb +++ /dev/null @@ -1,553 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>size</th>\n", - " <th>len</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>264</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>2</td>\n", - " <td>4</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "1 2 3\n", - "2 2 3\n", - "3 2 3\n", - "4 2 4" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.float_format', lambda x: '%.3f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " size len\n", - "count 5818143.000 5818143.000\n", - "mean 4.350 52.120\n", - "std 196.347 35.026\n", - "min 2.000 0.000\n", - "25% 2.000 24.000\n", - "50% 2.000 46.000\n", - "75% 3.000 72.000\n", - "max 151383.000 11686.000\n" - ] - } - ], - "source": [ - "print(df.describe())" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>size</th>\n", - " <th>len</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>264</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>187</th>\n", - " <td>5</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <th>276</th>\n", - " <td>28</td>\n", - " <td>11</td>\n", - " </tr>\n", - " <tr>\n", - " <th>284</th>\n", - " <td>7</td>\n", - " <td>6</td>\n", - " </tr>\n", - " <tr>\n", - " <th>288</th>\n", - " <td>6</td>\n", - " <td>6</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818054</th>\n", - " <td>7</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818060</th>\n", - " <td>6</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818104</th>\n", - " <td>6</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818118</th>\n", - " <td>5</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818128</th>\n", - " <td>13</td>\n", - " <td>4</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>448170 rows × 2 columns</p>\n", - "</div>" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "187 5 1\n", - "276 28 11\n", - "284 7 6\n", - "288 6 6\n", - "... ... ...\n", - "5818054 7 2\n", - "5818060 6 4\n", - "5818104 6 2\n", - "5818118 5 4\n", - "5818128 13 4\n", - "\n", - "[448170 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 4]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>size</th>\n", - " <th>len</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>264</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>276</th>\n", - " <td>28</td>\n", - " <td>11</td>\n", - " </tr>\n", - " <tr>\n", - " <th>314</th>\n", - " <td>195</td>\n", - " <td>15</td>\n", - " </tr>\n", - " <tr>\n", - " <th>329</th>\n", - " <td>10</td>\n", - " <td>14</td>\n", - " </tr>\n", - " <tr>\n", - " <th>364</th>\n", - " <td>98</td>\n", - " <td>15</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817734</th>\n", - " <td>18</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817835</th>\n", - " <td>11</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817886</th>\n", - " <td>20</td>\n", - " <td>5</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817901</th>\n", - " <td>15</td>\n", - " <td>10</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5818128</th>\n", - " <td>13</td>\n", - " <td>4</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>159500 rows × 2 columns</p>\n", - "</div>" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "276 28 11\n", - "314 195 15\n", - "329 10 14\n", - "364 98 15\n", - "... ... ...\n", - "5817734 18 5\n", - "5817835 11 4\n", - "5817886 20 5\n", - "5817901 15 10\n", - "5818128 13 4\n", - "\n", - "[159500 rows x 2 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] >= 10]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>size</th>\n", - " <th>len</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>264</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>314</th>\n", - " <td>195</td>\n", - " <td>15</td>\n", - " </tr>\n", - " <tr>\n", - " <th>428</th>\n", - " <td>122</td>\n", - " <td>31</td>\n", - " </tr>\n", - " <tr>\n", - " <th>525</th>\n", - " <td>173</td>\n", - " <td>28</td>\n", - " </tr>\n", - " <tr>\n", - " <th>727</th>\n", - " <td>270</td>\n", - " <td>31</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5816100</th>\n", - " <td>147</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817345</th>\n", - " <td>167</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817361</th>\n", - " <td>258</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817366</th>\n", - " <td>298</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5817374</th>\n", - " <td>252</td>\n", - " <td>2</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>9610 rows × 2 columns</p>\n", - "</div>" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "314 195 15\n", - "428 122 31\n", - "525 173 28\n", - "727 270 31\n", - "... ... ...\n", - "5816100 147 4\n", - "5817345 167 2\n", - "5817361 258 2\n", - "5817366 298 2\n", - "5817374 252 2\n", - "\n", - "[9610 rows x 2 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 100]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fuzzycat", - "language": "python", - "name": "fuzzycat" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} |