From 621f50e685d9beeb1fe502a133e76fbd5a8a9c5c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 24 Nov 2020 15:06:34 +0100 Subject: cleanup --- notebooks/Cluster Size and Title Length.ipynb | 553 -------------------------- 1 file changed, 553 deletions(-) delete mode 100644 notebooks/Cluster Size and Title Length.ipynb (limited to 'notebooks/Cluster Size and Title Length.ipynb') diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb deleted file mode 100644 index b78ba8b..0000000 --- a/notebooks/Cluster Size and Title Length.ipynb +++ /dev/null @@ -1,553 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
123
223
323
424
\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "1 2 3\n", - "2 2 3\n", - "3 2 3\n", - "4 2 4" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.float_format', lambda x: '%.3f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " size len\n", - "count 5818143.000 5818143.000\n", - "mean 4.350 52.120\n", - "std 196.347 35.026\n", - "min 2.000 0.000\n", - "25% 2.000 24.000\n", - "50% 2.000 46.000\n", - "75% 3.000 72.000\n", - "max 151383.000 11686.000\n" - ] - } - ], - "source": [ - "print(df.describe())" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
18751
2762811
28476
28866
.........
581805472
581806064
581810462
581811854
5818128134
\n", - "

448170 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "187 5 1\n", - "276 28 11\n", - "284 7 6\n", - "288 6 6\n", - "... ... ...\n", - "5818054 7 2\n", - "5818060 6 4\n", - "5818104 6 2\n", - "5818118 5 4\n", - "5818128 13 4\n", - "\n", - "[448170 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 4]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
2762811
31419515
3291014
3649815
.........
5817734185
5817835114
5817886205
58179011510
5818128134
\n", - "

159500 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "276 28 11\n", - "314 195 15\n", - "329 10 14\n", - "364 98 15\n", - "... ... ...\n", - "5817734 18 5\n", - "5817835 11 4\n", - "5817886 20 5\n", - "5817901 15 10\n", - "5818128 13 4\n", - "\n", - "[159500 rows x 2 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] >= 10]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
31419515
42812231
52517328
72727031
.........
58161001474
58173451672
58173612582
58173662982
58173742522
\n", - "

9610 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "314 195 15\n", - "428 122 31\n", - "525 173 28\n", - "727 270 31\n", - "... ... ...\n", - "5816100 147 4\n", - "5817345 167 2\n", - "5817361 258 2\n", - "5817366 298 2\n", - "5817374 252 2\n", - "\n", - "[9610 rows x 2 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 100]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fuzzycat", - "language": "python", - "name": "fuzzycat" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} -- cgit v1.2.3