From 2b216f17fccf6ff90b41ca972bf1730078cc6180 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 22 Oct 2020 20:15:46 +0200 Subject: update notes on cluster, nb --- notebooks/Cluster Size and Title Length.ipynb | 553 ++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 notebooks/Cluster Size and Title Length.ipynb (limited to 'notebooks') diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb new file mode 100644 index 0000000..b78ba8b --- /dev/null +++ b/notebooks/Cluster Size and Title Length.ipynb @@ -0,0 +1,553 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5818143" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size len
0 264 0
1 2 3
2 2 3
3 2 3
4 2 4
\n", + "
" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "1 2 3\n", + "2 2 3\n", + "3 2 3\n", + "4 2 4" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.float_format', lambda x: '%.3f' % x)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " size len\n", + "count 5818143.000 5818143.000\n", + "mean 4.350 52.120\n", + "std 196.347 35.026\n", + "min 2.000 0.000\n", + "25% 2.000 24.000\n", + "50% 2.000 46.000\n", + "75% 3.000 72.000\n", + "max 151383.000 11686.000\n" + ] + } + ], + "source": [ + "print(df.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size len
0 264 0
187 5 1
276 28 11
284 7 6
288 6 6
... ... ...
5818054 7 2
5818060 6 4
5818104 6 2
5818118 5 4
5818128 13 4
\n", + "

448170 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "187 5 1\n", + "276 28 11\n", + "284 7 6\n", + "288 6 6\n", + "... ... ...\n", + "5818054 7 2\n", + "5818060 6 4\n", + "5818104 6 2\n", + "5818118 5 4\n", + "5818128 13 4\n", + "\n", + "[448170 rows x 2 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] > 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size len
0 264 0
276 28 11
314 195 15
329 10 14
364 98 15
... ... ...
5817734 18 5
5817835 11 4
5817886 20 5
5817901 15 10
5818128 13 4
\n", + "

159500 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "276 28 11\n", + "314 195 15\n", + "329 10 14\n", + "364 98 15\n", + "... ... ...\n", + "5817734 18 5\n", + "5817835 11 4\n", + "5817886 20 5\n", + "5817901 15 10\n", + "5818128 13 4\n", + "\n", + "[159500 rows x 2 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] >= 10]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
size len
0 264 0
314 195 15
428 122 31
525 173 28
727 270 31
... ... ...
5816100 147 4
5817345 167 2
5817361 258 2
5817366 298 2
5817374 252 2
\n", + "

9610 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " size len\n", + "0 264 0\n", + "314 195 15\n", + "428 122 31\n", + "525 173 28\n", + "727 270 31\n", + "... ... ...\n", + "5816100 147 4\n", + "5817345 167 2\n", + "5817361 258 2\n", + "5817366 298 2\n", + "5817374 252 2\n", + "\n", + "[9610 rows x 2 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df[\"size\"] > 100]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5818143" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fuzzycat", + "language": "python", + "name": "fuzzycat" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- cgit v1.2.3