From 621f50e685d9beeb1fe502a133e76fbd5a8a9c5c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 24 Nov 2020 15:06:34 +0100 Subject: cleanup --- .mypy.ini | 3 - .pylintrc | 3 - Makefile | 1 + notebooks/Cluster Size and Title Length.ipynb | 553 - notebooks/Journal_Names.html | 14635 ------------------------ notebooks/Journal_Names.ipynb | 2032 ---- notebooks/Journal_Names.pdf | Bin 69255 -> 0 bytes notebooks/Journal_Names.zip | Bin 26734 -> 0 bytes notes/bm.md | 19 - notes/clustering.md | 102 - notes/general.md | 197 - notes/todo.md | 23 - notes/workflow.md | 60 - pyproject.toml | 3 - setup.py | 50 - 15 files changed, 1 insertion(+), 17680 deletions(-) delete mode 100644 .mypy.ini delete mode 100644 .pylintrc delete mode 100644 notebooks/Cluster Size and Title Length.ipynb delete mode 100644 notebooks/Journal_Names.html delete mode 100644 notebooks/Journal_Names.ipynb delete mode 100644 notebooks/Journal_Names.pdf delete mode 100644 notebooks/Journal_Names.zip delete mode 100644 notes/bm.md delete mode 100644 notes/clustering.md delete mode 100644 notes/general.md delete mode 100644 notes/todo.md delete mode 100644 notes/workflow.md delete mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.mypy.ini b/.mypy.ini deleted file mode 100644 index ebcf395..0000000 --- a/.mypy.ini +++ /dev/null @@ -1,3 +0,0 @@ -[mypy] -ignore_missing_imports = True - diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 72e94cb..0000000 --- a/.pylintrc +++ /dev/null @@ -1,3 +0,0 @@ -[MESSAGES CONTROL] - -disable=C0301 diff --git a/Makefile b/Makefile index ff7cb30..752fad3 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ clean: ## Clean all artifacts rm -rf .pytest_cache/ rm -rf .coverage rm -rf .mypy_cache/ + find . -name "__pycache__" -type d -exec rm -rf {} \; # Upload requires https://github.com/pypa/twine and some configuration. 
.PHONY: upload diff --git a/notebooks/Cluster Size and Title Length.ipynb b/notebooks/Cluster Size and Title Length.ipynb deleted file mode 100644 index b78ba8b..0000000 --- a/notebooks/Cluster Size and Title Length.ipynb +++ /dev/null @@ -1,553 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"../fixtures/cluster_title_normalized_dups_size_keylen.tsv\", sep=\"\\t\", names=[\"size\", \"len\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
123
223
323
424
\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "1 2 3\n", - "2 2 3\n", - "3 2 3\n", - "4 2 4" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.float_format', lambda x: '%.3f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " size len\n", - "count 5818143.000 5818143.000\n", - "mean 4.350 52.120\n", - "std 196.347 35.026\n", - "min 2.000 0.000\n", - "25% 2.000 24.000\n", - "50% 2.000 46.000\n", - "75% 3.000 72.000\n", - "max 151383.000 11686.000\n" - ] - } - ], - "source": [ - "print(df.describe())" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
18751
2762811
28476
28866
.........
581805472
581806064
581810462
581811854
5818128134
\n", - "

448170 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "187 5 1\n", - "276 28 11\n", - "284 7 6\n", - "288 6 6\n", - "... ... ...\n", - "5818054 7 2\n", - "5818060 6 4\n", - "5818104 6 2\n", - "5818118 5 4\n", - "5818128 13 4\n", - "\n", - "[448170 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 4]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
2762811
31419515
3291014
3649815
.........
5817734185
5817835114
5817886205
58179011510
5818128134
\n", - "

159500 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "276 28 11\n", - "314 195 15\n", - "329 10 14\n", - "364 98 15\n", - "... ... ...\n", - "5817734 18 5\n", - "5817835 11 4\n", - "5817886 20 5\n", - "5817901 15 10\n", - "5818128 13 4\n", - "\n", - "[159500 rows x 2 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] >= 10]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sizelen
02640
31419515
42812231
52517328
72727031
.........
58161001474
58173451672
58173612582
58173662982
58173742522
\n", - "

9610 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " size len\n", - "0 264 0\n", - "314 195 15\n", - "428 122 31\n", - "525 173 28\n", - "727 270 31\n", - "... ... ...\n", - "5816100 147 4\n", - "5817345 167 2\n", - "5817361 258 2\n", - "5817366 298 2\n", - "5817374 252 2\n", - "\n", - "[9610 rows x 2 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df[\"size\"] > 100]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5818143" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fuzzycat", - "language": "python", - "name": "fuzzycat" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Journal_Names.html b/notebooks/Journal_Names.html deleted file mode 100644 index a87394a..0000000 --- a/notebooks/Journal_Names.html +++ /dev/null @@ -1,14635 +0,0 @@ - - - - -Journal_Names - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
-

Journal Names

Questions in the context of fuzzy matching.

-
    -
  • How many journal names appear more than once?
  • -
  • What is the average length of the duplicated names vs the unique names?
  • -
-

Input file is a single larger JSON, mapping names to issns.

-
{
-  "Acta Orientalia.": [
-    "0001-6438"
-  ],
-  "Acta Orientalia (København)": [
-    "0001-6438"
-  ],
-..
-
- -
-
-
-
-
-
In [4]:
-
-
-
import json
-import pandas as pd
-
- -
-
-
- -
-
-
-
In [5]:
-
-
-
with open("../data/name_to_issn.json") as f:
-    mapping = json.load(f)
-
- -
-
-
- -
-
-
-
-

We have about 3M keys.

- -
-
-
-
-
-
In [7]:
-
-
-
len(mapping)
-
- -
-
-
- -
-
- - -
- -
Out[7]:
- - - - -
-
2929727
-
- -
- -
-
- -
-
-
-
In [21]:
-
-
-
df = pd.DataFrame(((k, len(v)) for k, v in mapping.items()), columns=["name", "issn_count"])
-
- -
-
-
- -
-
-
-
In [25]:
-
-
-
len(df)
-
- -
-
-
- -
-
- - -
- -
Out[25]:
- - - - -
-
2929727
-
- -
- -
-
- -
-
-
-
In [26]:
-
-
-
df.head()
-
- -
-
-
- -
-
- - -
- -
Out[26]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
0Acta Orientalia.1
1Acta Orientalia (København)1
2˜The œpublishers weekly.1
3Publishers weekly1
4ASMT news1
-
-
- -
- -
-
- -
-
-
-
In [31]:
-
-
-
unique_name = df[df.issn_count == 1]
-
- -
-
-
- -
-
-
-
In [32]:
-
-
-
repeated_names = df[df.issn_count > 1]
-
- -
-
-
- -
-
-
-
In [34]:
-
-
-
len(repeated_names)
-
- -
-
-
- -
-
- - -
- -
Out[34]:
- - - - -
-
194241
-
- -
- -
-
- -
-
-
-
In [33]:
-
-
-
len(repeated_names) / len(df)
-
- -
-
-
- -
-
- - -
- -
Out[33]:
- - - - -
-
0.06630003409874026
-
- -
- -
-
- -
-
-
-
-

About 6% (or 194241) names are repeated.

- -
-
-
-
-
-
In [35]:
-
-
-
repeated_names.describe()
-
- -
-
-
- -
-
- - -
- -
Out[35]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
issn_count
count194241.000000
mean3.197523
std25.081605
min2.000000
25%2.000000
50%2.000000
75%2.000000
max8980.000000
-
-
- -
- -
-
- -
-
-
-
-

Which name is shared by over 8000 ISSN?

- -
-
-
-
-
-
In [40]:
-
-
-
repeated_names.iloc[repeated_names.issn_count.argmax()] # Annual report.
-
- -
-
-
- -
-
- - -
- -
Out[40]:
- - - - -
-
name          Annual report.
-issn_count              8980
-Name: 45907, dtype: object
-
- -
- -
-
- -
-
-
-
-

It is the "Annual report."

- -
-
-
-
-
-
In [42]:
-
-
-
mapping["Annual report."][:10]
-
- -
-
-
- -
-
- - -
- -
Out[42]:
- - - - -
-
['0706-537X',
- '1186-7957',
- '2324-1926',
- '1445-9248',
- '0872-3982',
- '1714-1524',
- '1037-8812',
- '0225-0241',
- '1327-6344',
- '0702-7702']
-
- -
- -
-
- -
-
-
-
-

On average a repeated name will point to 3 ISSN. About 24k names point to more than 3 ISSN.

- -
-
-
-
-
-
In [45]:
-
-
-
len(repeated_names[repeated_names.issn_count > 3])
-
- -
-
-
- -
-
- - -
- -
Out[45]:
- - - - -
-
24107
-
- -
- -
-
- -
-
-
-
In [49]:
-
-
-
repeated_names[repeated_names.issn_count > 3].sample(n=10)
-
- -
-
-
- -
-
- - -
- -
Out[49]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
322100Philosophica.17
183928Edad de oro.4
294309Horoskop.10
517039˜Le œGrand journal.11
1664616Caleidoscop şcolar.4
258430˜La œFeuille.34
309546˜The œWilson quarterly.4
795859Introductory research essay4
1470838Publicaciones del SEMYR.4
657041˜Le œKiosque.14
-
-
- -
- -
-
- -
-
-
-
In [50]:
-
-
-
mapping["Philosophica."]
-
- -
-
-
- -
-
- - -
- -
Out[50]:
- - - - -
-
['1285-9133',
- '1480-4670',
- '1487-5349',
- '1724-6598',
- '2183-0134',
- '2538-693X',
- '2610-8933',
- '2035-8326',
- '2295-9084',
- '1517-8889',
- '2249-5053',
- '2420-9198',
- '2654-9263',
- '2610-8925',
- '1158-9574',
- '0872-4784',
- '0379-8402']
-
- -
- -
-
- -
-
-
-
In [61]:
-
-
-
repeated_names[repeated_names.issn_count > 3].issn_count.hist(bins=20)
-
- -
-
-
- -
-
- - -
- -
Out[61]:
- - - - -
-
<AxesSubplot:>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [60]:
-
-
-
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 50)].issn_count.hist(bins=10)
-
- -
-
-
- -
-
- - -
- -
Out[60]:
- - - - -
-
<AxesSubplot:>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [62]:
-
-
-
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 20)].issn_count.hist(bins=10)
-
- -
-
-
- -
-
- - -
- -
Out[62]:
- - - - -
-
<AxesSubplot:>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [64]:
-
-
-
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 8)].issn_count.hist()
-
- -
-
-
- -
-
- - -
- -
Out[64]:
- - - - -
-
<AxesSubplot:>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [70]:
-
-
-
repeated_names[repeated_names.issn_count > 1000].issn_count.hist(bins=10)
-
- -
-
-
- -
-
- - -
- -
Out[70]:
- - - - -
-
<AxesSubplot:>
-
- -
- -
- -
- - - - -
- -
- -
- -
-
- -
-
-
-
In [71]:
-
-
-
repeated_names[repeated_names.issn_count > 1000]
-
- -
-
-
- -
-
- - -
- -
Out[71]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
3499Bulletin.2752
7632Newsletter.2715
8317Rapport.1050
23662Proceedings.1403
45839Annual report /1090
45907Annual report.8980
45964Annuaire.1260
47217Rapport annuel.2656
-
-
- -
- -
-
- -
-
-
-
In [72]:
-
-
-
repeated_names[repeated_names.issn_count > 500]
-
- -
-
-
- -
-
- - -
- -
Out[72]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
102Bulletin d'information.693
3218Bulletin de liaison.510
3499Bulletin.2752
7632Newsletter.2715
8317Rapport.1050
23662Proceedings.1403
45794Report.743
45839Annual report /1090
45907Annual report.8980
45964Annuaire.1260
46370Jaarverslag.675
47142Rapport d'activité.660
47217Rapport annuel.2656
49289Jahresbericht.518
57558Annual report760
121599Alumni directory /511
128827Bulletin municipal.521
150529˜La œLettre.623
168933Local climatological data.613
269004Estimates.535
-
-
- -
- -
-
- -
-
-
-
In [75]:
-
-
-
repeated_names[repeated_names.issn_count > 200]
-
- -
-
-
- -
-
- - -
- -
Out[75]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
102Bulletin d'information.693
2665Newsletter /259
3218Bulletin de liaison.510
3499Bulletin.2752
3926Boletín.216
.........
425644Rapport d'activité ...394
532500Relatório e contas.247
603144Bildung und Beruf regional.292
1006131Vies de famille.222
1110247Country risk service.271
-

66 rows × 2 columns

-
-
- -
- -
-
- -
-
-
-
In [76]:
-
-
-
repeated_names[repeated_names.issn_count > 100]
-
- -
-
-
- -
-
- - -
- -
Out[76]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
102Bulletin d'information.693
2665Newsletter /259
3218Bulletin de liaison.510
3499Bulletin.2752
3926Boletín.216
.........
1306798Country commerce.120
1318569Bible studies for life.159
1796742LexisNexis practice guide.101
2628387Operational risk report.119
2650557Interempresas net.108
-

191 rows × 2 columns

-
-
- -
- -
-
- -
-
-
-
In [82]:
-
-
-
repeated_names
-
- -
-
-
- -
-
- - -
- -
Out[82]:
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
nameissn_count
5Activitas Nervosa Superior.2
11Library journal.2
23Acta cardiologica.2
26Actualidad económica.3
31Acta Ornithologica.3
.........
2929626Modern machine shop México.2
2929635Lecture notes in control and information scien...2
2929646Critical Studies in Dance Leadership and Inclu...2
2929691Nigerian Journal of Wildlife Management2
2929702Verzeichniss der Kunstwerke lebender Künstler,...2
-

194241 rows × 2 columns

-
-
- -
- -
-
- -
-
-
-
-

If a name matches a repeated name exactly or fuzzy matches to a repeated name and there is not other information available, the match status must be ambigious.

- -
-
-
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
- - - - - - diff --git a/notebooks/Journal_Names.ipynb b/notebooks/Journal_Names.ipynb deleted file mode 100644 index 91da2a2..0000000 --- a/notebooks/Journal_Names.ipynb +++ /dev/null @@ -1,2032 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Journal Names\n", - "\n", - "Questions in the context of fuzzy matching.\n", - "\n", - "* How many journal names appear more than once?\n", - "* What is the average length of the duplicated names vs the unique names?\n", - "\n", - "\n", - "Input file is a single larger JSON, mapping names to issns.\n", - "```json\n", - "{\n", - " \"Acta Orientalia.\": [\n", - " \"0001-6438\"\n", - " ],\n", - " \"Acta Orientalia (København)\": [\n", - " \"0001-6438\"\n", - " ],\n", - "..\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"../data/name_to_issn.json\") as f:\n", - " mapping = json.load(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have about 3M keys." 
- ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2938859" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(((k, len(v)) for k, v in mapping.items()), columns=[\"name\", \"issn_count\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2938859" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
0Acta Orientalia.1
1Acta Orientalia (København)1
2˜The œpublishers weekly.2
3Publishers weekly2
4ASMT news1
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "0 Acta Orientalia. 1\n", - "1 Acta Orientalia (København) 1\n", - "2 ˜The œpublishers weekly. 2\n", - "3 Publishers weekly 2\n", - "4 ASMT news 1" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "unique_name = df[df.issn_count == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "repeated_names = df[df.issn_count > 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "586466" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(repeated_names)" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.1995556779008452" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(repeated_names) / len(df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "About 19% (or 586466) names are repeated. " - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
issn_count
count586466.000000
mean2.443930
std15.260303
min2.000000
25%2.000000
50%2.000000
75%2.000000
max9520.000000
\n", - "
" - ], - "text/plain": [ - " issn_count\n", - "count 586466.000000\n", - "mean 2.443930\n", - "std 15.260303\n", - "min 2.000000\n", - "25% 2.000000\n", - "50% 2.000000\n", - "75% 2.000000\n", - "max 9520.000000" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Which name is shared most?" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "name Annual report.\n", - "issn_count 9520\n", - "Name: 45999, dtype: object" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names.iloc[repeated_names.issn_count.argmax()] # Annual report." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is the \"Annual report.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['1495-9097',\n", - " '2205-7919',\n", - " '1447-3836',\n", - " '1914-9220',\n", - " '0707-1515',\n", - " '1494-6149',\n", - " '2293-3174',\n", - " '1977-9046',\n", - " '0707-4298',\n", - " '2239-9674']" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"Annual report.\"][:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "On average a repeated name will point to 3 ISSN. Over 20k names point to more than 3 ISSN." 
- ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "27230" - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(repeated_names[repeated_names.issn_count > 3])" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
1356987Karrier online6
1778072Curriculum express for parents.4
366690Annual Report ....7
878789Brilliant.4
37507Synthesis.35
942931Special Burda11
640073Belle.6
2666134Naše delo6
376646Opinion.12
918869Limited edition magazine.16
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "1356987 Karrier online 6\n", - "1778072 Curriculum express for parents. 4\n", - "366690 Annual Report .... 7\n", - "878789 Brilliant. 4\n", - "37507 Synthesis. 35\n", - "942931 Special Burda 11\n", - "640073 Belle. 6\n", - "2666134 Naše delo 6\n", - "376646 Opinion. 12\n", - "918869 Limited edition magazine. 16" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 3].sample(n=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['1158-9574',\n", - " '2183-0134',\n", - " '1517-8889',\n", - " '1480-4670',\n", - " '0379-8402',\n", - " '1724-6598',\n", - " '2035-8326',\n", - " '2249-5053',\n", - " '0872-4784',\n", - " '2610-8933',\n", - " '2610-8925',\n", - " '2654-9263',\n", - " '2420-9198',\n", - " '2538-693X',\n", - " '1487-5349',\n", - " '1285-9133',\n", - " '2655-5662',\n", - " '2295-9084']" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"Philosophica.\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAASEklEQVR4nO3df6zddX3H8edrrWDFHxTZbljbrDU2S6pkgDfYRbPcyVIKLismxECIVGTWTEh0I5lF/8CJJLIM3SCKq9JZFhQY4tq4uq5jnBj/AAFllB9ir1BHmwJK+WExkdW998f5VI/lXnp7envv7T3PR3Jyvt/39/P98T7fe/u653u+9zZVhSRpsP3WdB+AJGn6GQaSJMNAkmQYSJIwDCRJwNzpPoB+nXjiibV48eK+1n3xxRc57rjjJveAjiL2b//2P7j933fffT+tqt8+sH7UhsHixYu59957+1q30+kwMjIyuQd0FLF/+7f/kek+jGmT5Mdj1b1MJEkyDCRJhoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkjuLfQD4c23Y9z/vX/ltf6+74zLsn+Wgkafr5zkCSZBhIkgwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkiQmEQZJFSe5M8nCSh5J8pNU/mWRXkvvb4+yedS5PMprk0SRn9tRXttpokrU99SVJ7m71W5IcM9mNSpLGN5F3BvuAy6pqGbAcuCTJsrbsc1V1SntsBmjLzgPeAqwEvpBkTpI5wOeBs4BlwPk927m6bevNwLPAxZPUnyRpAg4aBlW1u6q+16Z/BjwCLHiFVVYBN1fVL6rqcWAUOL09Rqvqsap6CbgZWJUkwLuA29r6G4Bz+uxHktSHQ/rMIMli4FTg7la6NMkDSdYnmd9qC4Anelbb2Wrj1d8IPFdV+w6oS5KmyIT/c5skrwW+Dny0ql5Icj1wJVDt+RrgA0fkKH99DGuANQBDQ0N0Op2+tjM0Dy47ed/BB46h333OJHv37p0VffTL/u1/kPsfz4TCIMmr6AbBTVV1O0BVPdWz/EvAN9vsLmBRz+oLW41x6s8AxyeZ294d9I7/DVW1DlgHMDw8XCMjIxM5/Je57qaNXLOtv//kbccF/e1zJul0OvT72s0G9m//g9z/eCZyN1GAG4BHquqzPfWTeoa9B3iwTW8CzktybJIlwFLgu8A9wNJ259AxdD9k3lRVBdwJnNvWXw1sPLy2JEmHYiI/Hr8DeB+wLcn9rfZxuncDnUL3MtEO4EMAVfVQkluBh+neiXRJVf0SIMmlwBZgDrC+qh5q2/sYcHOSTwPfpxs+kqQpctAwqKrvABlj0eZXWOcq4Kox6pvHWq+qHqN7t5EkaRr4G8iSJMNAkmQYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJCYQBkkWJbkzycNJHkrykVY/IcnWJNvb8/xWT5Jrk4wmeSDJaT3bWt3Gb0+yuqf+tiTb2jrXJsmRaFaSNLaJvDPYB1xWVcuA5cAlSZYBa4E7qmopcEebBzgLWNoea4DroRsewBXA24HTgSv2B0gb88Ge9VYefmuSpIk6aBhU1e6q+l6b/hnwCLAAWAVsaMM2AOe06VXAjdV1F3B8kpOAM4GtVbWnqp4FtgIr27LXV9VdVVXAjT3bkiRNgUP6zCDJYuBU4G5gqKp2t0VPAkNtegHwRM9qO1vtleo7x6hLkqbI3IkOTPJa4OvAR6vqhd7L+lVVSeoIHN+Bx7CG7qUnhoaG6HQ6fW1naB5cdvK+vtbtd58zyd69e2dFH/2yf/sf5P7HM6EwSPIqukFwU1Xd3spPJTmpqna3Sz1Pt/ouYFHP6gtbbRcwckC90+oLxxj/MlW1DlgHMDw8XCMjI2MNO6j
rbtrINdsmnIO/YccF/e1zJul0OvT72s0G9m//g9z/eCZyN1GAG4BHquqzPYs2AfvvCFoNbOypX9juKloOPN8uJ20BViSZ3z44XgFsacteSLK87evCnm1JkqbARH48fgfwPmBbkvtb7ePAZ4Bbk1wM/Bh4b1u2GTgbGAV+DlwEUFV7klwJ3NPGfaqq9rTpDwNfAeYB32oPSdIUOWgYVNV3gPHu+z9jjPEFXDLOttYD68eo3wu89WDHIkk6MvwNZEmSYSBJMgwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kSEwiDJOuTPJ3kwZ7aJ5PsSnJ/e5zds+zyJKNJHk1yZk99ZauNJlnbU1+S5O5WvyXJMZPZoCTp4CbyzuArwMox6p+rqlPaYzNAkmXAecBb2jpfSDInyRzg88BZwDLg/DYW4Oq2rTcDzwIXH05DkqRDd9AwqKpvA3smuL1VwM1V9YuqehwYBU5vj9GqeqyqXgJuBlYlCfAu4La2/gbgnENrQZJ0uA7nM4NLkzzQLiPNb7UFwBM9Y3a22nj1NwLPVdW+A+qSpCk0t8/1rgeuBKo9XwN8YLIOajxJ1gBrAIaGhuh0On1tZ2geXHbyvoMPHEO/+5xJ9u7dOyv66Jf92/8g9z+evsKgqp7aP53kS8A32+wuYFHP0IWtxjj1Z4Djk8xt7w56x4+133XAOoDh4eEaGRnp5/C57qaNXLOtvxzccUF/+5xJOp0O/b52s4H92/8g9z+evi4TJTmpZ/Y9wP47jTYB5yU5NskSYCnwXeAeYGm7c+gYuh8yb6qqAu4Ezm3rrwY29nNMkqT+HfTH4yRfA0aAE5PsBK4ARpKcQvcy0Q7gQwBV9VCSW4GHgX3AJVX1y7adS4EtwBxgfVU91HbxMeDmJJ8Gvg/cMFnNSZIm5qBhUFXnj1Ee9x/sqroKuGqM+mZg8xj1x+jebSRJmib+BrIkyTCQJBkGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAksQEwiDJ+iRPJ3mwp3ZCkq1Jtrfn+a2eJNcmGU3yQJLTetZZ3cZvT7K6p/62JNvaOtcmyWQ3KUl6ZRN5Z/AVYOUBtbXAHVW1FLijzQOcBSxtjzXA9dAND+AK4O3A6cAV+wOkjflgz3oH7kuSdIQdNAyq6tvAngPKq4ANbXoDcE5P/cbqugs4PslJwJnA1qraU1XPAluBlW3Z66vqrqoq4MaebUmSpsjcPtcbqqrdbfpJYKhNLwCe6Bm3s9Veqb5zjPqYkqyh+46DoaEhOp1Ofwc/Dy47eV9f6/a7z5lk7969s6KPftm//Q9y/+PpNwx+paoqSU3GwUxgX+uAdQDDw8M1MjLS13auu2kj12zrr/UdF/S3z5mk0+nQ72s3G9i//Q9y/+Pp926ip9olHtrz062+C1jUM25hq71SfeEYdUnSFOo3DDYB++8IWg1s7Klf2O4qWg483y4nbQFWJJnfPjheAWxpy15IsrzdRXRhz7YkSVPkoNdKknwNGAFOTLKT7l1BnwFuTXIx8GPgvW34ZuBsYBT4OXARQFXtSXIlcE8b96mq2v+h9Ifp3rE0D/hWe0iSptBBw6Cqzh9n0RljjC3gknG2sx5YP0b9XuCtBzsOSdKR428gS5IMA0mSYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJI4zDBIsiPJtiT
3J7m31U5IsjXJ9vY8v9WT5Noko0keSHJaz3ZWt/Hbk6w+vJYkSYdqMt4Z/HFVnVJVw21+LXBHVS0F7mjzAGcBS9tjDXA9dMMDuAJ4O3A6cMX+AJEkTY0jcZloFbChTW8Azump31hddwHHJzkJOBPYWlV7qupZYCuw8ggclyRpHHMPc/0C/iNJAf9YVeuAoara3ZY/CQy16QXAEz3r7my18eovk2QN3XcVDA0N0el0+jrooXlw2cn7+lq3333OJHv37p0VffTL/u1/kPsfz+GGwTuraleS3wG2JvlB78KqqhYUk6KFzTqA4eHhGhkZ6Ws71920kWu29df6jgv62+dM0ul06Pe1mw3s3/4Huf/xHNZloqra1Z6fBr5B95r/U+3yD+356TZ8F7CoZ/WFrTZeXZI0RfoOgyTHJXnd/mlgBfAgsAnYf0fQamBjm94EXNjuKloOPN8uJ20BViSZ3z44XtFqkqQpcjiXiYaAbyTZv52vVtW/J7kHuDXJxcCPgfe28ZuBs4FR4OfARQBVtSfJlcA9bdynqmrPYRyXJOkQ9R0GVfUY8Adj1J8BzhijXsAl42xrPbC+32ORJB0efwNZkmQYSJIMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAksQMCoMkK5M8mmQ0ydrpPh5JGiQzIgySzAE+D5wFLAPOT7Jseo9KkgbHjAgD4HRgtKoeq6qXgJuBVdN8TJI0MOZO9wE0C4AneuZ3Am8/cFCSNcCaNrs3yaN97u9E4Kf9rJir+9zjzNJ3/7OE/dv/IPf/e2MVZ0oYTEhVrQPWHe52ktxbVcOTcEhHJfu3f/sf3P7HM1MuE+0CFvXML2w1SdIUmClhcA+wNMmSJMcA5wGbpvmYJGlgzIjLRFW1L8mlwBZgDrC+qh46grs87EtNRzn7H2z2r5dJVU33MUiSptlMuUwkSZpGhoEkabDCYLb+yYski5LcmeThJA8l+Uirn5Bka5Lt7Xl+qyfJte11eCDJaT3bWt3Gb0+yerp66keSOUm+n+SbbX5Jkrtbn7e0mxNIcmybH23LF/ds4/JWfzTJmdPUyiFLcnyS25L8IMkjSf5wkM5/kr9sX/sPJvlaklcP0vmfFFU1EA+6H0z/CHgTcAzw38Cy6T6uSertJOC0Nv064Id0/6zH3wJrW30tcHWbPhv4FhBgOXB3q58APNae57fp+dPd3yG8Dn8FfBX4Zpu/FTivTX8R+Is2/WHgi236POCWNr2sfV0cCyxpXy9zpruvCfa+AfjzNn0McPygnH+6v7T6ODCv57y/f5DO/2Q8Bumdwaz9kxdVtbuqvtemfwY8QvcbZBXdfyRoz+e06VXAjdV1F3B8kpOAM4GtVbWnqp4FtgIrp66T/iVZCLwb+HKbD/Au4LY25MD+978utwFntPGrgJur6hdV9TgwSvfrZkZL8gbgj4AbAKrqpap6jgE6/3TvjJyXZC7wGmA3A3L+J8sghcFYf/JiwTQdyxHT3vKeCtwNDFXV7rboSWCoTY/3WhzNr9HfA38N/F+bfyPwXFXta/O9vfyqz7b8+Tb+aO1/CfAT4J/aZbIvJzmOATn/VbUL+Dvgf+iGwPPAfQzO+Z8UgxQGs16S1wJfBz5aVS/0Lqvu++BZeR9xkj8Fnq6q+6b7WKbJXOA04PqqOhV4ke5loV+Z5ed/Pt2f6pcAvwscx9HzjmbGGKQwmNV/8iLJq+gGwU1VdXsrP9Xe/tOen2718V6Lo/U1egfwZ0l20L389y7gH+he/tj/i5W9vfyqz7b8DcAzHL397wR2VtXdbf42uuEwKOf/T4DHq+onVfW/wO10vyYG5fxPikEKg1n7Jy/a9c4bgEeq6rM9izYB++8IWQ1s7Klf2O4qWQ483y4nbAFWJJnfftpa0Wo
zWlVdXlULq2ox3fP6X1V1AXAncG4bdmD/+1+Xc9v4avXz2t0mS4ClwHenqI2+VdWTwBNJfr+VzgAeZkDOP93LQ8uTvKZ9L+zvfyDO/6SZ7k+wp/JB9y6KH9K9S+AT0308k9jXO+leAngAuL89zqZ7HfQOYDvwn8AJbXzo/mdCPwK2AcM92/oA3Q/ORoGLpru3Pl6LEX59N9Gb6H4zjwL/Ahzb6q9u86Nt+Zt61v9Ee10eBc6a7n4Ooe9TgHvb18C/0r0baGDOP/A3wA+AB4F/pntH0MCc/8l4+OcoJEkDdZlIkjQOw0CSZBhIkgwDSRKGgSQJw0CShGEgSQL+Hz8ESgUOWx2AAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 3].issn_count.hist(bins=20)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 50)].issn_count.hist(bins=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWyklEQVR4nO3df7DddX3n8edryaIglYC0t26S2TA1tYNQXbgLdJ3tXqQDQR3DH9bBYUu0rJlt0Vo3swp2usyqzGCVpcJWdrKSBdoMkVK7ySiKGeSu05kGEX8Qfmi5ixGSRWINxI0/N+57/zifrIfLveTec27u+aZ5Pmbu3PP9fD/f73l9Q25e9/s933NIVSFJOrr9o1EHkCSNnmUgSbIMJEmWgSQJy0CSBCwZdYBBnXLKKbVy5cqBtv3BD37AS17ykoUNtMC6nrHr+aD7GbueD7qfsev5oHsZH3jggb+vql983oqqOiK/zjrrrBrUvffeO/C2i6XrGbuer6r7Gbuer6r7Gbuer6p7GYEv1wz/pnqZSJJkGUiSLANJEpaBJAnLQJKEZSBJwjKQJGEZSJKwDCRJHMEfRzGMHbv38bYrP7Poz7vz2jcs+nNK0lx4ZiBJsgwkSZaBJAnLQJKEZSBJwjKQJGEZSJKwDCRJzKEMkmxMsifJQzOsW5+kkpzSlpPkhiRTSR5Mcmbf3LVJHmtfa/vGz0qyo21zQ5Is1MFJkuZmLmcGtwCrpw8mWQFcADzRN3wRsKp9rQNuanNPBq4GzgHOBq5OclLb5ibgHX3bPe+5JEmH1yHLoKq+COydYdX1wHuB6htbA9zW/r/L24GlSV4OXAhsq6q9VfUMsA1Y3da9tKq2t/9R823AxUMdkSRp3gb6bKIka4DdVfX1aVd1lgFP9i3vamMvNL5rhvHZnncdvTMOxsbGmJycHCQ+Y8fB+jMODLTtMOaTd//+/QMf32Loej7ofsau54PuZ+x6PjgyMsIAZZDkeOD99C4RLaqq2gBsABgfH6+JiYmB9nPjpi1ct2PxP6Nv56UTc547OTnJoMe3GLqeD7qfsev5oPsZu54PjoyMMNjdRL8CnAp8PclOYDnwlSS/DOwGVvTNXd7GXmh8+QzjkqRFNO8yqKodVfVLVbWyqlbSu7RzZlV9B9gKXNbuKjoX2FdVTwF3AxckOam9cHwBcHdb9/0k57a7iC4DtizQsUmS5mgut5beDvwt8Moku5Jc/gLT7wIeB6aA/wr8PkBV7QU+CNzfvj7QxmhzPtG2+Z/AZwc7FEnSoA554byq3nqI9Sv7HhdwxSzzNgIbZxj/MnD6oXJIkg4f34EsSbIMJEmWgSQJy0CShGUgScIykCRhGUiSsAwkSVgGkiQsA0kSloEkCctAkoRlIEnCMpAkYRlIkrAMJElYBpIkLANJEpaBJIk5lEGSjUn2JHmob+wjSb6R5MEkf51kad+6q5JMJflmkgv7xle3sakkV/aNn5rkvjb+ySTHLuDxSZLmYC5nBrcAq6eNbQNOr6p
fB/4OuAogyWnAJcCr2jYfT3JMkmOAPwMuAk4D3trmAnwYuL6qXgE8A1w+1BFJkubtkGVQVV8E9k4b+3xVHWiL24Hl7fEaYHNV/aSqvgVMAWe3r6mqeryqfgpsBtYkCfA64M62/a3AxcMdkiRpvpYswD5+F/hke7yMXjkctKuNATw5bfwc4GXAs33F0j//eZKsA9YBjI2NMTk5OVDgseNg/RkHDj1xgc0n7/79+wc+vsXQ9XzQ/Yxdzwfdz9j1fHBkZIQhyyDJHwEHgE0LE+eFVdUGYAPA+Ph4TUxMDLSfGzdt4bodC9GD87Pz0ok5z52cnGTQ41sMXc8H3c/Y9XzQ/YxdzwdHRkYYogySvA14I3B+VVUb3g2s6Ju2vI0xy/j3gKVJlrSzg/75kqRFMtCtpUlWA+8F3lRVP+xbtRW4JMmLkpwKrAK+BNwPrGp3Dh1L70Xmra1E7gXe3LZfC2wZ7FAkSYOay62ltwN/C7wyya4klwP/GfgFYFuSryX5LwBV9TBwB/AI8Dngiqr6Wfut/53A3cCjwB1tLsD7gH+XZIreawg3L+gRSpIO6ZCXiarqrTMMz/oPdlVdA1wzw/hdwF0zjD9O724jSdKI+A5kSZJlIEmyDCRJWAaSJCwDSRKWgSQJy0CShGUgScIykCRhGUiSsAwkSVgGkiQsA0kSloEkCctAkoRlIEnCMpAkYRlIkrAMJEnMoQySbEyyJ8lDfWMnJ9mW5LH2/aQ2niQ3JJlK8mCSM/u2WdvmP5Zkbd/4WUl2tG1uSJKFPkhJ0guby5nBLcDqaWNXAvdU1SrgnrYMcBGwqn2tA26CXnkAVwPnAGcDVx8skDbnHX3bTX8uSdJhdsgyqKovAnunDa8Bbm2PbwUu7hu/rXq2A0uTvBy4ENhWVXur6hlgG7C6rXtpVW2vqgJu69uXJGmRLBlwu7Gqeqo9/g4w1h4vA57sm7erjb3Q+K4ZxmeUZB29Mw7GxsaYnJwcLPxxsP6MAwNtO4z55N2/f//Ax7cYup4Pup+x6/mg+xm7ng+OjIwweBn8f1VVSWohwszhuTYAGwDGx8drYmJioP3cuGkL1+0Y+tDnbeelE3OeOzk5yaDHtxi6ng+6n7Hr+aD7GbueD46MjDD43URPt0s8tO972vhuYEXfvOVt7IXGl88wLklaRIOWwVbg4B1Ba4EtfeOXtbuKzgX2tctJdwMXJDmpvXB8AXB3W/f9JOe2u4gu69uXJGmRHPJaSZLbgQnglCS76N0VdC1wR5LLgW8Db2nT7wJeD0wBPwTeDlBVe5N8ELi/zftAVR18Ufr36d2xdBzw2fYlSVpEhyyDqnrrLKvOn2FuAVfMsp+NwMYZxr8MnH6oHJKkw8d3IEuSLANJkmUgScIykCRhGUiSsAwkSVgGkiQsA0kSloEkCctAkoRlIEnCMpAkYRlIkrAMJElYBpIkLANJEpaBJAnLQJLEkGWQ5D1JHk7yUJLbk7w4yalJ7ksyleSTSY5tc1/Ulqfa+pV9+7mqjX8zyYVDHpMkaZ4GLoMky4A/AMar6nTgGOAS4MPA9VX1CuAZ4PK2yeXAM238+jaPJKe17V4FrAY+nuSYQXNJkuZv2MtES4DjkiwBjgeeAl4H3NnW3wpc3B6vacu09ecnSRvfXFU/qapvAVPA2UPmkiTNw8BlUFW7gY8CT9ArgX3AA8CzVXWgTdsFLGuPlwFPtm0PtPkv6x+fYRtJ0iJYMuiGSU6i91v9qcCzwF/Su8xz2CRZB6wDGBsbY3JycqD9jB0H6884cOiJC2w+effv3z/w8S2GrueD7mfsej7ofsau54MjIyMMUQbAbwHfqqrvAiT5FPBaYGmSJe23/+XA7jZ/N7AC2NUuK50IfK9v/KD+bZ6jqjYAGwDGx8drYmJioOA3btrCdTuGOfTB7Lx0Ys5zJycnGfT4FkPX80H3M3Y9H3Q/Y9fzwZGREYZ7zeAJ4Nwkx7dr/+cDjwD3Am9uc9YCW9rjrW2Ztv4LVVV
t/JJ2t9GpwCrgS0PkkiTN08C/HlfVfUnuBL4CHAC+Su+39s8Am5N8qI3d3Da5GfjzJFPAXnp3EFFVDye5g16RHACuqKqfDZpLkjR/Q10rqaqrgaunDT/ODHcDVdWPgd+eZT/XANcMk0WSNDjfgSxJsgwkSZaBJAnLQJKEZSBJwjKQJGEZSJKwDCRJWAaSJCwDSRKWgSQJy0CShGUgScIykCRhGUiSsAwkSVgGkiQsA0kSloEkiSHLIMnSJHcm+UaSR5P8RpKTk2xL8lj7flKbmyQ3JJlK8mCSM/v2s7bNfyzJ2mEPSpI0P8OeGXwM+FxV/RrwauBR4ErgnqpaBdzTlgEuAla1r3XATQBJTgauBs4BzgauPlggkqTFMXAZJDkR+E3gZoCq+mlVPQusAW5t024FLm6P1wC3Vc92YGmSlwMXAtuqam9VPQNsA1YPmkuSNH+pqsE2TF4DbAAeoXdW8ADwbmB3VS1tcwI8U1VLk3wauLaq/qatuwd4HzABvLiqPtTG/xj4UVV9dIbnXEfvrIKxsbGzNm/ePFD2PXv38fSPBtp0KGcsO3HOc/fv388JJ5xwGNMMp+v5oPsZu54Pup+x6/mgexnPO++8B6pqfPr4kiH2uQQ4E3hXVd2X5GP8/JIQAFVVSQZrmxlU1QZ6BcT4+HhNTEwMtJ8bN23huh3DHPpgdl46Mee5k5OTDHp8i6Hr+aD7GbueD7qfsev54MjICMO9ZrAL2FVV97XlO+mVw9Pt8g/t+562fjewom/75W1stnFJ0iIZuAyq6jvAk0le2YbOp3fJaCtw8I6gtcCW9ngrcFm7q+hcYF9VPQXcDVyQ5KT2wvEFbUyStEiGvVbyLmBTkmOBx4G30yuYO5JcDnwbeEubexfwemAK+GGbS1XtTfJB4P427wNVtXfIXJKkeRiqDKrqa8DzXoigd5YwfW4BV8yyn43AxmGySJIG5zuQJUmWgSTJMpAkYRlIkrAMJElYBpIkLANJEpaBJAnLQJKEZSBJwjKQJGEZSJKwDCRJWAaSJCwDSRKWgSQJy0CShGUgScIykCSxAGWQ5JgkX03y6bZ8apL7kkwl+WSSY9v4i9ryVFu/sm8fV7Xxbya5cNhMkqT5WYgzg3cDj/Ytfxi4vqpeATwDXN7GLweeaePXt3kkOQ24BHgVsBr4eJJjFiCXJGmOhiqDJMuBNwCfaMsBXgfc2abcClzcHq9py7T157f5a4DNVfWTqvoWMAWcPUwuSdL8LBly+z8F3gv8Qlt+GfBsVR1oy7uAZe3xMuBJgKo6kGRfm78M2N63z/5tniPJOmAdwNjYGJOTkwOFHjsO1p9x4NATF9h88u7fv3/g41sMXc8H3c/Y9XzQ/YxdzwdHRkYYogySvBHYU1UPJJlYsEQvoKo2ABsAxsfHa2JisKe9cdMWrtsxbA/O385LJ+Y8d3JykkGPbzF0PR90P2PX80H3M3Y9HxwZGWG4M4PXAm9K8nrgxcBLgY8BS5MsaWcHy4Hdbf5uYAWwK8kS4ETge33jB/VvI0laBAO/ZlBVV1XV8qpaSe8F4C9U1aXAvcCb27S1wJb2eGtbpq3/QlVVG7+k3W10KrAK+NKguSRJ83c4rpW8D9ic5EPAV4Gb2/jNwJ8nmQL20isQqurhJHcAjwAHgCuq6meHIZckaRYLUgZVNQlMtsePM8PdQFX1Y+C3Z9n+GuCahcgiSZo/34EsSbIMJEmWgSQJy0CShGUgScIykCRxeN5noA5aeeVnFnR/6884wNvmsM+d175hQZ9X0uHhmYEkyTKQJHmZaFHN51LNXC/DSNJC8MxAkmQZSJIsA0kSloEkCctAkoRlIEnCMpAkYRlIkrAMJEkMUQZJViS5N8kjSR5O8u42fnKSbUkea99PauNJckOSqSQPJjmzb19r2/zHkqwd/rAkSfMxzJnBAWB9VZ0GnAtckeQ04ErgnqpaBdzTlgEuAla1r3XATdArD+Bq4BzgbODqgwUiSVocA5dBVT1
VVV9pj/838CiwDFgD3Nqm3Qpc3B6vAW6rnu3A0iQvBy4EtlXV3qp6BtgGrB40lyRp/lJVw+8kWQl8ETgdeKKqlrbxAM9U1dIknwauraq/aevuAd4HTAAvrqoPtfE/Bn5UVR+d4XnW0TurYGxs7KzNmzcPlHfP3n08/aOBNl00Y8fR6YxzzXfGshMPf5hZ7N+/nxNOOGFkz38oXc8H3c/Y9XzQvYznnXfeA1U1Pn186E8tTXIC8FfAH1bV93v//vdUVSUZvm1+vr8NwAaA8fHxmpiYGGg/N27awnU7uv2BrevPONDpjHPNt/PSicMfZhaTk5MM+ndkMXQ9H3Q/Y9fzwZGREYa8myjJP6ZXBJuq6lNt+Ol2+Yf2fU8b3w2s6Nt8eRubbVyStEiGuZsowM3Ao1X1n/pWbQUO3hG0FtjSN35Zu6voXGBfVT0F3A1ckOSk9sLxBW1MkrRIhrkO8Vrgd4AdSb7Wxt4PXAvckeRy4NvAW9q6u4DXA1PAD4G3A1TV3iQfBO5v8z5QVXuHyCVJmqeBy6C9EJxZVp8/w/wCrphlXxuBjYNmkSQNx3cgS5IsA0mSZSBJwjKQJLEAbzqTumrH7n287crPLPrz7rz2DYv+nNKwPDOQJFkGkiTLQJKEZSBJwheQdZitHMELuAetP2NkTy0dcTwzkCRZBpIkLxNJ/6As9GW59WccmNN7NXxvxZHPMwNJkmUgSfIykbTg5nqpZq6XYKTFYBlIOmKN6vOn4B/e6yReJpIkeWYgaXijenPhKN9YOKrLgYfrjKQzZwZJVif5ZpKpJFeOOo8kHU06UQZJjgH+DLgIOA14a5LTRptKko4enSgD4Gxgqqoer6qfApuBNSPOJElHjVTVqDOQ5M3A6qr6N235d4Bzquqd0+atA9a1xVcC3xzwKU8B/n7AbRdL1zN2PR90P2PX80H3M3Y9H3Qv4z+tql+cPnhEvYBcVRuADcPuJ8mXq2p8ASIdNl3P2PV80P2MXc8H3c/Y9XxwZGSE7lwm2g2s6Fte3sYkSYugK2VwP7AqyalJjgUuAbaOOJMkHTU6cZmoqg4keSdwN3AMsLGqHj6MTzn0paZF0PWMXc8H3c/Y9XzQ/YxdzwdHRsZuvIAsSRqtrlwmkiSNkGUgSTr6yiDJMUm+muTTo84ykyRLk9yZ5BtJHk3yG6PONF2S9yR5OMlDSW5P8uIR59mYZE+Sh/rGTk6yLclj7ftJHcz4kfbf+cEkf51k6Qgjzpixb936JJXklFFkaxlmzJfkXe3P8eEkfzKqfC3LTP+dX5Nke5KvJflykrNHmXE2R10ZAO8GHh11iBfwMeBzVfVrwKvpWNYky4A/AMar6nR6L/hfMtpU3AKsnjZ2JXBPVa0C7mnLo3QLz8+4DTi9qn4d+DvgqsUONc0tPD8jSVYAFwBPLHagaW5hWr4k59H7tIJXV9WrgI+OIFe/W3j+n+GfAP+xql4D/Ie23DlHVRkkWQ68AfjEqLPMJMmJwG8CNwNU1U+r6tmRhprZEuC4JEuA44H/NcowVfVFYO+04TXAre3xrcDFi5lpupkyVtXnq+pAW9xO7/01IzPLnyPA9cB7gZHebTJLvt8Drq2qn7Q5exY9WJ9ZMhbw0vb4REb88zKbo6oMgD+l95f6/444x2xOBb4L/Ld2KesTSV4y6lD9qmo3vd++ngCeAvZV1edHm2pGY1X1VHv8HWBslGHm4HeBz446xHRJ1gC7q+rro84yi18F/mWS+5L8jyT/fNSBZvCHwEeSPEnvZ2fUZ4AzOmrKIMkbgT1V9cCos7yAJcCZwE1V9c+AHzD6yxvP0a69r6FXXP8EeEmSfz3aVC+sevdPd/Ye6iR/BBwANo06S78kxwPvp3dpo6uWACcD5wL/HrgjSUYb6Xl+D3hPVa0A3kM78++ao6YMgNcCb0qyk96nor4uyV+MNtLz7AJ2VdV9bflOeuXQJb8FfKuqvltV/wf4FPAvRpxpJk8neTlA+z7SywezSfI
24I3ApdW9N/38Cr3S/3r7uVkOfCXJL4801XPtAj5VPV+id9Y/she5Z7GW3s8JwF/S+5TmzjlqyqCqrqqq5VW1kt4Lnl+oqk79RltV3wGeTPLKNnQ+8MgII83kCeDcJMe338DOp2Mvcjdb6f0Q0r5vGWGWGSVZTe+y5Zuq6oejzjNdVe2oql+qqpXt52YXcGb7e9oV/x04DyDJrwLH0q1PCIXeawT/qj1+HfDYCLPMqhMfR6HneBewqX1G0+PA20ec5zmq6r4kdwJfoXdp46uM+O32SW4HJoBTkuwCrgaupXfJ4HLg28BbRpdw1oxXAS8CtrUrG9ur6t92KWNVdeaSxix/hhuBje1Wzp8Ca0d5hjVLxncAH2s3XPyYn38Mf6f4cRSSpKPnMpEkaXaWgSTJMpAkWQaSJCwDSRKWgSQJy0CSBPw/7n+kOOvFQywAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 20)].issn_count.hist(bins=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAASuUlEQVR4nO3df4xd5X3n8fdncZMQaDAJ1Syy0RopVipSti0ZAVFW0RDvgoEII5VmiWgxiK73B02zK6SGrNRFSoJEpLBJQFtWVmBrsjQOddO1m9BQi2R2t39Ag5NsHKCIWWKKLYLT2DjrhCbr7nf/uI/Tm9HYvnPvnTt3Lu+XNJpznvOcc5/vfTTzmXPuuXdSVUiSXtv+wXIPQJK0/AwDSZJhIEkyDCRJGAaSJGDVcg+gX+ecc06tW7eur31/+MMfcsYZZwx3QMtkUmqZlDrAWsbRpNQBg9WyZ8+ev6mqX1ho24oNg3Xr1vHkk0/2te/s7CwzMzPDHdAymZRaJqUOsJZxNCl1wGC1JHnhRNu8TCRJMgwkSYaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJFbwO5AHsffAEW66/Usjf9x9d1098seUpF54ZiBJMgwkST2EQZIHkhxM8u2utjcn2Z3kufb97NaeJPckmUvyrSQXde2zufV/LsnmrvZ3JNnb9rknSYZdpCTp5Ho5M/gDYOO8ttuBx6pqPfBYWwe4EljfvrYA90EnPIA7gEuAi4E7jgdI6/Mvuvab/1iSpCV2yjCoqv8BHJrXvAnY1pa3Add2tT9YHY8Dq5OcC1wB7K6qQ1V1GNgNbGzb3lRVj1dVAQ92HUuSNCL93k00VVUvteXvAlNteQ3wYle//a3tZO37F2hfUJItdM44mJqaYnZ2tr/Bnw63XXisr30H0e94T+bo0aNLctxRm5Q6wFrG0aTUAUtXy8C3llZVJalhDKaHx9oKbAWYnp6ufv/Bw70P7eTuvaO/q3bfDTNDP+ak/NOOSakDrGUcTUodsHS19Hs30cvtEg/t+8HWfgA4r6vf2tZ2sva1C7RLkkao3zDYBRy/I2gzsLOr/cZ2V9GlwJF2OelR4PIkZ7cXji8HHm3bfpDk0nYX0Y1dx5Ikjcgpr5Uk+RwwA5yTZD+du4LuAh5OcgvwAvC+1v0R4CpgDvgRcDNAVR1K8lHga63fR6rq+IvS/4bOHUunA3/WviRJI3TKMKiq959g04YF+hZw6wmO8wDwwALtTwK/dKpxSJKWju9AliQZBpIkw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSQwYBkn+XZKnknw7yeeSvCHJ+UmeSDKX5PNJXtf6vr6tz7Xt67qO8+HW/mySKwasSZK0SH2HQZI1wO8A01X1S8BpwPXAx4FPVtVbgcP
ALW2XW4DDrf2TrR9JLmj7vR3YCPx+ktP6HZckafEGvUy0Cjg9ySrgjcBLwHuAHW37NuDatryprdO2b0iS1r69qn5cVd8B5oCLBxyXJGkRVvW7Y1UdSPIJ4K+BV4E/B/YAr1TVsdZtP7CmLa8BXmz7HktyBHhLa3+869Dd+/yMJFuALQBTU1PMzs72Nfap0+G2C4+duuOQ9Tvekzl69OiSHHfUJqUOsJZxNCl1wNLV0ncYJDmbzl/15wOvAH9E5zLPkqmqrcBWgOnp6ZqZmenrOPc+tJO79/Zdet/23TAz9GPOzs7S7/MwTialDrCWcTQpdcDS1TLIZaJ/Cnynqr5XVf8X+ALwLmB1u2wEsBY40JYPAOcBtO1nAd/vbl9gH0nSCAwSBn8NXJrkje3a/wbgaeCrwHWtz2ZgZ1ve1dZp279SVdXar293G50PrAf+coBxSZIWaZDXDJ5IsgP4OnAM+AadSzhfArYn+Vhru7/tcj/w2SRzwCE6dxBRVU8leZhOkBwDbq2qv+t3XJKkxRvownlV3QHcMa/5eRa4G6iq/hb49RMc507gzkHGIknqn+9AliQZBpIkw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSQwYBklWJ9mR5K+SPJPknUnenGR3kufa97Nb3yS5J8lckm8luajrOJtb/+eSbB60KEnS4gx6ZvBp4MtV9YvALwPPALcDj1XVeuCxtg5wJbC+fW0B7gNI8mbgDuAS4GLgjuMBIkkajb7DIMlZwLuB+wGq6idV9QqwCdjWum0Drm3Lm4AHq+NxYHWSc4ErgN1VdaiqDgO7gY39jkuStHipqv52TH4F2Ao8TeesYA/wQeBAVa1ufQIcrqrVSb4I3FVVf9G2PQZ8CJgB3lBVH2vtvwe8WlWfWOAxt9A5q2Bqauod27dv72vsBw8d4eVX+9p1IBeuOWvoxzx69Chnnnnm0I87apNSB1jLOJqUOmCwWi677LI9VTW90LZVA4xpFXAR8IGqeiLJp/n7S0IAVFUl6S9tFlBVW+kEENPT0zUzM9PXce59aCd37x2k9P7su2Fm6MecnZ2l3+dhnExKHWAt42hS6oClq2WQ1wz2A/ur6om2voNOOLzcLv/Qvh9s2w8A53Xtv7a1nahdkjQifYdBVX0XeDHJ21rTBjqXjHYBx+8I2gzsbMu7gBvbXUWXAkeq6iXgUeDyJGe3F44vb22SpBEZ9FrJB4CHkrwOeB64mU7APJzkFuAF4H2t7yPAVcAc8KPWl6o6lOSjwNdav49U1aEBxyVJWoSBwqCqvgks9GLEhgX6FnDrCY7zAPDAIGORJPXPdyBLkgwDSZJhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkScCq5R6AJtu627/UU7/bLjzGTT327dW+u64e6vGkSeaZgSTJMJAkGQaSJAwDSRKGgSQJw0CShGEgScIwkCQxhDBIclqSbyT5Yls/P8kTSeaSfD7J61r769v6XNu+rusYH27tzya5YtAxSZIWZxhnBh8Enula/zjwyap6K3AYuKW13wIcbu2fbP1IcgFwPfB2YCPw+0lOG8K4JEk9GigMkqwFrgY+09YDvAfY0bpsA65ty5vaOm37htZ/E7C9qn5cVd8B5oCLBxmXJGlxBv1sok8Bvwv8fFt/C/BKVR1r6/uBNW15DfAiQFUdS3Kk9V8DPN51zO59fkaSLcAWgKmpKWZnZ/sa9NTpnc/CGbV+x3syR48eXZLjDkuvz/NSzMlyPS/jPieLMSm1TEodsHS19B0GSd4LHKyqPUlmhjaik6iqrcBWgOnp6ZqZ6e9h731
oJ3fvHf1n9O27YWbox5ydnaXf52EUev3wudsuPDb0OVmK57sX4z4nizEptUxKHbB0tQzy0/cu4JokVwFvAN4EfBpYnWRVOztYCxxo/Q8A5wH7k6wCzgK+39V+XPc+kqQR6Ps1g6r6cFWtrap1dF4A/kpV3QB8FbiuddsM7GzLu9o6bftXqqpa+/XtbqPzgfXAX/Y7LknS4i3FtZIPAduTfAz4BnB/a78f+GySOeAQnQChqp5K8jDwNHAMuLWq/m4JxiVJOoGhhEFVzQKzbfl5FrgbqKr+Fvj1E+x/J3DnMMYiSVo834EsSTIMJEmGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkszX86k7QM1t3+paEf87YLj3FTD8fdd9fVQ39sjZZnBpIkw0CSZBhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRIDhEGS85J8NcnTSZ5K8sHW/uYku5M8176f3dqT5J4kc0m+leSirmNtbv2fS7J58LIkSYsxyJnBMeC2qroAuBS4NckFwO3AY1W1HnisrQNcCaxvX1uA+6ATHsAdwCXAxcAdxwNEkjQafYdBVb1UVV9vy/8HeAZYA2wCtrVu24Br2/Im4MHqeBxYneRc4Apgd1UdqqrDwG5gY7/jkiQt3lBeM0iyDvhV4Algqqpeapu+C0y15TXAi1277W9tJ2qXJI1IqmqwAyRnAv8duLOqvpDklapa3bX9cFWdneSLwF1V9Ret/THgQ8AM8Iaq+lhr/z3g1ar6xAKPtYXOJSampqbesX379r7GfPDQEV5+ta9dB3LhmrOGfsyjR49y5plnDv24w7L3wJGe+k2dztDnZCme714s15z0+lwvRq/zslzPda/G/edkMQap5bLLLttTVdMLbRvofyAn+Tngj4GHquoLrfnlJOdW1UvtMtDB1n4AOK9r97Wt7QCdQOhun13o8apqK7AVYHp6umZmZhbqdkr3PrSTu/eO/t8/77thZujHnJ2dpd/nYRR6+f+50Plfu8Oek6V4vnuxXHPS63O9GL3Oy3I9170a95+TxViqWga5myjA/cAzVfUfuzbtAo7fEbQZ2NnVfmO7q+hS4Ei7nPQocHmSs9sLx5e3NknSiAzyp9i7gN8E9ib5Zmv798BdwMNJbgFeAN7Xtj0CXAXMAT8CbgaoqkNJPgp8rfX7SFUdGmBckqRF6jsM2rX/nGDzhgX6F3DrCY71APBAv2ORpFFbtwSX5XrxBxvPWJLj+g5kSZJhIEkyDCRJGAaSJAwDSRKGgSQJw0CShGEgScIwkCRhGEiSMAwkSRgGkiQMA0kShoEkCcNAkoRhIEnCMJAkYRhIkjAMJEkYBpIkDANJEoaBJAnDQJKEYSBJwjCQJGEYSJIwDCRJGAaSJAwDSRKGgSQJw0CShGEgSWKMwiDJxiTPJplLcvtyj0eSXkvGIgySnAb8J+BK4ALg/UkuWN5RSdJrx1iEAXAxMFdVz1fVT4DtwKZlHpMkvWakqpZ7DCS5DthYVb/V1n8TuKSqfntevy3Alrb6NuDZPh/yHOBv+tx33ExKLZNSB1jLOJqUOmCwWv5RVf3CQhtW9T+e0auqrcDWQY+T5Mmqmh7CkJbdpNQyKXWAtYyjSakDlq6WcblMdAA4r2t9bWuTJI3AuITB14D1Sc5P8jrgemDXMo9Jkl4zxuIyUVUdS/LbwKPAacADVfXUEj7kwJeaxsik1DIpdYC1jKNJqQOWqJaxeAFZkrS8xuUykSRpGRkGkqTJDoMkpyX5RpIvLrDt9Uk+3z7+4okk65ZhiD07RS03Jflekm+2r99ajjH2Ism+JHvbOJ9cYHuS3NPm5VtJLlqOcfaih1pmkhzpmpf/sBzj7EWS1Ul2JPmrJM8keee87StiXnqoY0XMSZK3dY3xm0l+kOTfzusz1DkZixeQl9AHgWeANy2w7RbgcFW
9Ncn1wMeBfz7KwS3SyWoB+Pz8N+mNscuq6kRvmrkSWN++LgHua9/H1clqAfifVfXekY2mf58GvlxV17U7+t44b/tKmZdT1QErYE6q6lngV+CnH9dzAPiTed2GOicTe2aQZC1wNfCZE3TZBGxryzuADUkyirEtVg+1TJJNwIPV8TiwOsm5yz2oSZbkLODdwP0AVfWTqnplXrexn5ce61iJNgD/u6pemNc+1DmZ2DAAPgX8LvD/TrB9DfAidG5tBY4AbxnJyBbvU5y8FoBfa6eKO5Kcd5J+y62AP0+yp328yHw/nZdmf2sbR6eqBeCdSf5Xkj9L8vZRDm4Rzge+B/yXdinyM0nOmNdnJcxLL3XAypiTbtcDn1ugfahzMpFhkOS9wMGq2rPcYxlUj7X8KbCuqv4xsJu/P+MZR/+kqi6ic4p7a5J3L/eABnCqWr5O57Ngfhm4F/hvIx5fr1YBFwH3VdWvAj8EVuLHyPdSx0qZEwDapa5rgD9a6seayDAA3gVck2QfnU9AfU+S/zqvz08/AiPJKuAs4PujHGSPTllLVX2/qn7cVj8DvGO0Q+xdVR1o3w/SuQZ68bwuK+ajSU5VS1X9oKqOtuVHgJ9Lcs7IB3pq+4H9VfVEW99B55dqt5UwL6esYwXNyXFXAl+vqpcX2DbUOZnIMKiqD1fV2qpaR+cU6ytV9Rvzuu0CNrfl61qfsXsHXi+1zLtOeA2dF5rHTpIzkvz88WXgcuDb87rtAm5sd0pcChypqpdGPNRT6qWWJP/w+OtQSS6m8/M2dn9wVNV3gReTvK01bQCentdt7OellzpWypx0eT8LXyKCIc/JpN9N9DOSfAR4sqp20XmR6bNJ5oBDdH7RrhjzavmdJNcAx+jUctNyju0kpoA/aT+Lq4A/rKovJ/lXAFX1n4FHgKuAOeBHwM3LNNZT6aWW64B/neQY8Cpw/Tj+wdF8AHioXZZ4Hrh5hc7LqepYMXPS/sj4Z8C/7Gpbsjnx4ygkSZN5mUiStDiGgSTJMJAkGQaSJAwDSRKGgSQJw0CSBPx/YueE4LohpnIAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 8)].issn_count.hist()" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAANX0lEQVR4nO3dbYxc91XH8e/BmybFGzkJKSuzidhUqipFtSDxqCQKL3YNTdMkat/0haMCKQ9aCUQVnoRsISH1BSJFVQVIiMZqCxW03ZY0gcpWiUKbJaoELrttGjtxTNzENLFS3PDgdoMEBA4v5q69a2ZmZx9m5qz9/UijvQ//O3Pu0Z3fzt65147MRJJU1/eNugBJUm8GtSQVZ1BLUnEGtSQVZ1BLUnFjg3jS66+/Pqempgbx1EPz2muvsXPnzlGXUZb96c7e9GZ/OltcXHw1M9/Uad1AgnpqaoqFhYVBPPXQzM/PMz09PeoyyrI/3dmb3uxPZxHxT93WeepDkoozqCWpOINakoozqCWpOINakoozqCWpuL4uz4uI08D3gP8BXs/M1iCLkiRdsJ7rqGcy89WBVSJJ6shTH5JUXPTzHwdExIvAvwEJPJSZhzqMmQVmASYmJvbOzc1tqKBjZ85taLvN2jO5a9X80tIS4+PjI6llO7A/3dmb3uxPZzMzM4vdTiv3G9STmXkmIn4QeBz4QGY+2W18q9XKjd5CPnXgyIa226zTD96zat7bXHuzP93Zm97sT2cR0TWo+zr1kZlnmp9ngUeBt29deZKkXtYM6ojYGRFXL08DdwLHB12YJKmtn6s+JoBHI2J5/Kcz868HWpUk6bw1gzozXwB+ZAi1SJI68PI8SSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4voO6ojYERFfj4jDgyxIkrTaej5RPwCcGFQhkqTO+grqiLgBuAf42GDLkSRdLDJz7UERDwO/C1wN/EZm3tthzCwwCzAxMbF3bm5uQwUdO3NuQ9tt1p7JXavml5aWGB8fH0kt24H96c7e9GZ/OpuZmVnMzFandWNrbRwR9wJnM3MxIqa7jcvMQ8AhgFarldPTXYf29P4DRza03Wadft/0qvn5+Xk2ug+XA/vTnb3pzf6sXz+nPu4A3h0Rp4E5YF9E/PlAq5IknbdmUGfmwcy8ITOngP3AlzPzpwZemSQJ8DpqSSpvzXPUK2XmPDA/kEokSR35iVqSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak4g1qSijOoJak
4g1qSijOoJak4g1qSijOoJak4g1qSijOoJam4NYM6Iq6KiK9GxDci4pmI+OAwCpMktY31MeY/gX2ZuRQRVwBfiYgvZubfD7g2SRJ9BHVmJrDUzF7RPHKQRUmSLujrHHVE7IiIp4CzwOOZeXSgVUmSzov2B+Y+B0dcAzwKfCAzj1+0bhaYBZiYmNg7Nze3oYKOnTm3oe02a8/krlXzS0tLjI+PD+W1q+zzegyzP9uNvenN/nQ2MzOzmJmtTuvWFdQAEfHbwH9k5oe7jWm1WrmwsLC+KhtTB45saLvNOv3gPavm5+fnmZ6eHsprV9nn9Rhmf7Ybe9Ob/eksIroGdT9Xfbyp+SRNRLwReAfw3JZWKEnqqp+rPnYDn4yIHbSD/XOZeXiwZUmSlvVz1cfTwC1DqEWS1IF3JkpScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScWsGdUTcGBFPRMSzEfFMRDwwjMIkSW1jfYx5Hfj1zPxaRFwNLEbE45n57IBrkyTRxyfqzHwlM7/WTH8POAFMDrowSVJbZGb/gyOmgCeBt2Xmdy9aNwvMAkxMTOydm5vbUEHHzpzb0HabtWdy16r5paUlxsfHh/LaVfZ5PYbZn+3G3vRmfzqbmZlZzMxWp3V9B3VEjAN/C/xOZj7Sa2yr1cqFhYV1FwowdeDIhrbbrNMP3rNqfn5+nunp6aG8dpV9Xo9h9me7sTe92Z/OIqJrUPd11UdEXAF8HvjUWiEtSdpa/Vz1EcDHgROZ+ZHBlyRJWqmfT9R3AD8N7IuIp5rH3QOuS5LUWPPyvMz8ChBDqEWS1IF3JkpScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBW3ZlBHxCci4mxEHB9GQZKk1fr5RP2nwF0DrkOS1MWaQZ2ZTwL/OoRaJEkdRGauPShiCjicmW/rMWYWmAWYmJjYOzc3t6GCjp05t6HtNmvP5K5V80tLS4yPjw/ltavs83oMsz/bjb3pbRj92Y7vqZmZmcXMbHVat2VBvVKr1cqFhYV1Fbls6sCRDW23WacfvGfV/Pz8PNPT00N57Sr7vB7D7M92Y296G0Z/tuN7KiK6BrVXfUhScQa1JBXXz+V5nwH+DnhrRLwcET8/+LIkScvG1hqQmfcNoxBJUmee+pCk4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4gxqSSrOoJak4voK6oi4KyJORsSpiDgw6KIkSResGdQRsQP4I+BdwM3AfRFx86ALkyS19fOJ+u3Aqcx8ITP/C5gD3jPYsiRJy8b6GDMJvLRi/mXgxy4eFBGzwGwzuxQRJzdf3vDEh/7fouuBV4dfyfB02Of1uOT7swn2prdLtj+bfE/9cLcV/QR1XzLzEHBoq55v1CJiITNbo66jKvvTnb3pzf6sXz+nPs4AN66Yv6FZJkkagn6C+h+At0TETRHxBmA/8IXBliVJWrbmqY/MfD0ifhl4DNgBfCIznxl4ZaN3yZzGGRD705296c3+rFNk5qhrkCT14J2JklScQS1JxV02QR0RN0bEExH
xbEQ8ExEPNMuvi4jHI+L55ue1zfKIiD9sbpt/OiJuXfFc9zfjn4+I+0e1T1stInZExNcj4nAzf1NEHG168Nnmy2Qi4spm/lSzfmrFcxxslp+MiHeOaFcGIiKuiYiHI+K5iDgREbd7/FwQEb/avLeOR8RnIuIqj6EtkpmXxQPYDdzaTF8N/CPtW+J/DzjQLD8AfKiZvhv4IhDAbcDRZvl1wAvNz2ub6WtHvX9b1KNfAz4NHG7mPwfsb6Y/CvxiM/1LwEeb6f3AZ5vpm4FvAFcCNwHfBHaMer+2sD+fBH6hmX4DcI3Hz/neTAIvAm9ccey832Noi/o76gJGtuPwV8A7gJPA7mbZbuBkM/0QcN+K8Seb9fcBD61Yvmrcdn3Qvj7+S8A+4HATMK8CY83624HHmunHgNub6bFmXAAHgYMrnvP8uO3+AHY1QRQXLff4yfNB/VLzC2isOYbe6TG0NY/L5tTHSs2fWbcAR4GJzHylWfVtYKKZ7nTr/GSP5dvd7wO/CfxvM/8DwL9n5uvN/Mr9PN+DZv25Zvyl2htof7r7DvAnzemhj0XETjx+AMjMM8CHgW8Br9A+JhbxGNoSl11QR8Q48HngVzLzuyvXZftX+GV3vWJE3AuczczFUddS2BhwK/DHmXkL8BrtUx3nXa7HD0Bzbv49tH+h/RCwE7hrpEVdQi6roI6IK2iH9Kcy85Fm8T9HxO5m/W7gbLO8263zl+It9XcA746I07T/dcR9wB8A10TE8k1RK/fzfA+a9buAf+HS7M2yl4GXM/NoM/8w7eD2+Gn7SeDFzPxOZv438Ajt48pjaAtcNkEdEQF8HDiRmR9ZseoLwPI37/fTPne9vPxnmm/vbwPONX/iPgbcGRHXNp8i7myWbVuZeTAzb8jMKdpf7Hw5M98HPAG8txl2cW+We/beZnw2y/c33+jfBLwF+OqQdmOgMvPbwEsR8dZm0U8Az+Lxs+xbwG0R8f3Ne225Px5DW2HUJ8mH9QB+nPafpU8DTzWPu2mfF/sS8DzwN8B1zfig/R8mfBM4BrRWPNfPAaeax8+Oet+2uE/TXLjq48203ySngL8ArmyWX9XMn2rWv3nF9r/V9Owk8K5R788W9+ZHgYXmGPpL2ldtePxc2K8PAs8Bx4E/o33lhsfQFjy8hVySirtsTn1I0nZlUEtScQa1JBVnUEtScQa1JBVnUEtScQa1JBX3fwsmzuohajGeAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 1000].issn_count.hist(bins=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
3511Bulletin.2797
7652Newsletter.2773
8338Rapport.1062
23716Proceedings.1565
45931Annual report /1382
45999Annual report.9520
46056Annuaire.1263
47310Rapport annuel.2811
72341Annual report1074
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "3511 Bulletin. 2797\n", - "7652 Newsletter. 2773\n", - "8338 Rapport. 1062\n", - "23716 Proceedings. 1565\n", - "45931 Annual report / 1382\n", - "45999 Annual report. 9520\n", - "46056 Annuaire. 1263\n", - "47310 Rapport annuel. 2811\n", - "72341 Annual report 1074" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 1000]" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
103Bulletin d'information.696
3230Bulletin de liaison.512
3511Bulletin.2797
7652Newsletter.2773
8338Rapport.1062
23716Proceedings.1565
45886Report.764
45931Annual report /1382
45999Annual report.9520
46056Annuaire.1263
46462Jaarverslag.678
47235Rapport d'activité.690
47310Rapport annuel.2811
49388Jahresbericht.528
72341Annual report1074
121778Alumni directory /511
129027Bulletin municipal.521
150771˜La œLettre.630
169246Local climatological data.613
269569Estimates.680
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "103 Bulletin d'information. 696\n", - "3230 Bulletin de liaison. 512\n", - "3511 Bulletin. 2797\n", - "7652 Newsletter. 2773\n", - "8338 Rapport. 1062\n", - "23716 Proceedings. 1565\n", - "45886 Report. 764\n", - "45931 Annual report / 1382\n", - "45999 Annual report. 9520\n", - "46056 Annuaire. 1263\n", - "46462 Jaarverslag. 678\n", - "47235 Rapport d'activité. 690\n", - "47310 Rapport annuel. 2811\n", - "49388 Jahresbericht. 528\n", - "72341 Annual report 1074\n", - "121778 Alumni directory / 511\n", - "129027 Bulletin municipal. 521\n", - "150771 ˜La œLettre. 630\n", - "169246 Local climatological data. 613\n", - "269569 Estimates. 680" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 500]" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
103Bulletin d'information.696
2676Newsletter /290
3230Bulletin de liaison.512
3511Bulletin.2797
3941Boletín.227
.........
534658Relatório e contas.248
606501Bildung und Beruf regional.292
1013518Vies de famille.222
1117647Country risk service.271
1236478Performance report for the period ending March...217
\n", - "

72 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "103 Bulletin d'information. 696\n", - "2676 Newsletter / 290\n", - "3230 Bulletin de liaison. 512\n", - "3511 Bulletin. 2797\n", - "3941 Boletín. 227\n", - "... ... ...\n", - "534658 Relatório e contas. 248\n", - "606501 Bildung und Beruf regional. 292\n", - "1013518 Vies de famille. 222\n", - "1117647 Country risk service. 271\n", - "1236478 Performance report for the period ending March... 217\n", - "\n", - "[72 rows x 2 columns]" - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 200]" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
103Bulletin d'information.696
2676Newsletter /290
3230Bulletin de liaison.512
3511Bulletin.2797
3941Boletín.227
.........
1315194Country commerce.120
1327255Bible studies for life.159
1805527LexisNexis practice guide.110
2637306Operational risk report.119
2659477Interempresas net.115
\n", - "

204 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "103 Bulletin d'information. 696\n", - "2676 Newsletter / 290\n", - "3230 Bulletin de liaison. 512\n", - "3511 Bulletin. 2797\n", - "3941 Boletín. 227\n", - "... ... ...\n", - "1315194 Country commerce. 120\n", - "1327255 Bible studies for life. 159\n", - "1805527 LexisNexis practice guide. 110\n", - "2637306 Operational risk report. 119\n", - "2659477 Interempresas net. 115\n", - "\n", - "[204 rows x 2 columns]" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names[repeated_names.issn_count > 100]" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
2˜The œpublishers weekly.2
3Publishers weekly2
6Activitas Nervosa Superior.2
12Library journal.2
24Acta cardiologica.2
.........
2938851AAPS introductions in the pharmaceutical scien...2
2938852AAPS introductions in the pharmaceutical scien...2
2938856Verzeichniss der Werke lebender Künstler auf d...2
2938857IEEE Advanced Information Management, Communic...2
2938858Products finishing México (Print)2
\n", - "

586466 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "2 ˜The œpublishers weekly. 2\n", - "3 Publishers weekly 2\n", - "6 Activitas Nervosa Superior. 2\n", - "12 Library journal. 2\n", - "24 Acta cardiologica. 2\n", - "... ... ...\n", - "2938851 AAPS introductions in the pharmaceutical scien... 2\n", - "2938852 AAPS introductions in the pharmaceutical scien... 2\n", - "2938856 Verzeichniss der Werke lebender Künstler auf d... 2\n", - "2938857 IEEE Advanced Information Management, Communic... 2\n", - "2938858 Products finishing México (Print) 2\n", - "\n", - "[586466 rows x 2 columns]" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If a name matches a repeated name exactly or fuzzy matches to a repeated name and there is not other information available, the match status must be ambigious." - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['2735-9298', '2735-928X']" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"Nigerian Journal of Wildlife Management\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are two ISSN refering to the same journal." 
- ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'@graph': [{'@id': 'http://id.loc.gov/vocabulary/countries/nr',\n", - " 'label': 'Nigeria'},\n", - " {'@id': 'organization/ISSNCenter#_e',\n", - " '@type': 'http://schema.org/Organization'},\n", - " {'@id': 'resource/ISSN-L/2735-9298',\n", - " 'identifiedBy': 'resource/ISSN/2735-928X#ISSN-L'},\n", - " {'@id': 'resource/ISSN/2735-928X',\n", - " '@type': ['http://id.loc.gov/ontologies/bibframe/Work',\n", - " 'http://id.loc.gov/ontologies/bibframe/Instance',\n", - " 'http://schema.org/Periodical'],\n", - " 'identifiedBy': ['resource/ISSN/2735-928X#ISSN-L',\n", - " 'resource/ISSN/2735-928X#ISSN',\n", - " 'resource/ISSN/2735-928X#KeyTitle'],\n", - " 'mainTitle': 'Nigerian Journal of Wildlife Management',\n", - " 'otherPhysicalFormat': 'resource/ISSN/2735-9298',\n", - " 'title': 'resource/ISSN/2735-928X#KeyTitle',\n", - " 'format': 'vocabularies/medium#Online',\n", - " 'identifier': '2735-928X',\n", - " 'isFormatOf': 'resource/ISSN/2735-9298',\n", - " 'type': 'http://marc21rdf.info/terms/formofmaterial#a',\n", - " 'http://purl.org/ontology/bibo/issn': '2735-928X',\n", - " 'isPartOf': 'resource/ISSN-L/2735-9298',\n", - " 'issn': '2735-928X',\n", - " 'name': ['Nigerian JOurnal of Wildlife Management (Ondo. 
Online)',\n", - " 'Nigerian Journal of Wildlife Management'],\n", - " 'publication': 'resource/ISSN/2735-928X#ReferencePublicationEvent',\n", - " 'url': 'http://www.wildlifesociety.ng.org/'},\n", - " {'@id': 'resource/ISSN/2735-928X#ISSN',\n", - " '@type': 'http://id.loc.gov/ontologies/bibframe/Issn',\n", - " 'status': 'vocabularies/IdentifierStatus#Valid',\n", - " 'value': '2735-928X'},\n", - " {'@id': 'resource/ISSN/2735-928X#ISSN-L',\n", - " '@type': 'http://id.loc.gov/ontologies/bibframe/IssnL',\n", - " 'status': 'vocabularies/IdentifierStatus#Valid',\n", - " 'value': '2735-9298'},\n", - " {'@id': 'resource/ISSN/2735-928X#KeyTitle',\n", - " '@type': ['http://id.loc.gov/ontologies/bibframe/Identifier',\n", - " 'http://id.loc.gov/ontologies/bibframe/KeyTitle'],\n", - " 'value': 'Nigerian JOurnal of Wildlife Management (Ondo. Online)'},\n", - " {'@id': 'resource/ISSN/2735-928X#Record',\n", - " '@type': 'http://schema.org/CreativeWork',\n", - " 'status': 'vocabularies/RecordStatus#Register',\n", - " 'modified': '20200808163600.0',\n", - " 'mainEntity': 'resource/ISSN/2735-928X',\n", - " 'wasAttributedTo': 'organization/ISSNCenter#_e'},\n", - " {'@id': 'resource/ISSN/2735-928X#ReferencePublicationEvent',\n", - " '@type': 'http://schema.org/PublicationEvent',\n", - " 'location': 'http://id.loc.gov/vocabulary/countries/nr'}],\n", - " '@context': {'status': {'@id': 'http://id.loc.gov/ontologies/bibframe/status',\n", - " '@type': '@id'},\n", - " 'value': {'@id': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#value'},\n", - " 'publication': {'@id': 'http://schema.org/publication', '@type': '@id'},\n", - " 'mainTitle': {'@id': 'http://id.loc.gov/ontologies/bibframe/mainTitle'},\n", - " 'title': {'@id': 'http://id.loc.gov/ontologies/bibframe/title',\n", - " '@type': '@id'},\n", - " 'name': {'@id': 'http://schema.org/name'},\n", - " 'issn': {'@id': 'http://schema.org/issn'},\n", - " 'format': {'@id': 'http://purl.org/dc/elements/1.1/format', '@type': '@id'},\n", - " 'url': 
{'@id': 'http://schema.org/url'},\n", - " 'identifiedBy': {'@id': 'http://id.loc.gov/ontologies/bibframe/identifiedBy',\n", - " '@type': '@id'},\n", - " 'otherPhysicalFormat': {'@id': 'http://id.loc.gov/ontologies/bibframe/otherPhysicalFormat',\n", - " '@type': '@id'},\n", - " 'isPartOf': {'@id': 'http://schema.org/isPartOf', '@type': '@id'},\n", - " 'type': {'@id': 'http://purl.org/dc/terms/type', '@type': '@id'},\n", - " 'identifier': {'@id': 'http://purl.org/dc/elements/1.1/identifier'},\n", - " 'isFormatOf': {'@id': 'http://purl.org/dc/terms/isFormatOf', '@type': '@id'},\n", - " 'wasAttributedTo': {'@id': 'http://www.w3.org/ns/prov#wasAttributedTo',\n", - " '@type': '@id'},\n", - " 'mainEntity': {'@id': 'http://schema.org/mainEntity', '@type': '@id'},\n", - " 'modified': {'@id': 'http://purl.org/dc/terms/modified',\n", - " '@type': 'http://www.w3.org/2001/XMLSchema#dateTime'},\n", - " 'location': {'@id': 'http://schema.org/location', '@type': '@id'},\n", - " 'label': {'@id': 'http://www.w3.org/2000/01/rdf-schema#label'}}}" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "requests.get(\"https://portal.issn.org/resource/ISSN/2735-928X?format=json\").json()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Interestingly, most (80%) journal do not seem to have the distinction between electronic and print. But it may be that names are not used consistently." - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.8004443220991548" - ] - }, - "execution_count": 94, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(unique_name) / len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
1848393Aux petits bonheurs des enfants1
1147789Statistik om boliger ... i Aalborg Kommune.1
2789832Mur (Regensburg)1
35201Boletín de la Sociedad Vasco-Navarra de pediat...1
1955109Cucina piatti unici.1
1039ABD1
2595157Tribuna quinzenal de Mataró1
416131Japan high tech review1
2885449Revista cósmica calavera.1
343232First days.1
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "1848393 Aux petits bonheurs des enfants 1\n", - "1147789 Statistik om boliger ... i Aalborg Kommune. 1\n", - "2789832 Mur (Regensburg) 1\n", - "35201 Boletín de la Sociedad Vasco-Navarra de pediat... 1\n", - "1955109 Cucina piatti unici. 1\n", - "1039 ABD 1\n", - "2595157 Tribuna quinzenal de Mataró 1\n", - "416131 Japan high tech review 1\n", - "2885449 Revista cósmica calavera. 1\n", - "343232 First days. 1" - ] - }, - "execution_count": 96, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "unique_name.sample(n=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example \"Fieldiana\"\n", - "\n", - "* \"Fieldiana. Anthropology\"\n", - "* https://www.jstor.org/journal/fieldianaanthro" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['2162-4321', '0071-4739']" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"Fieldiana. Anthropology\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, jstor reports two ISSN: INSSP: 0071-4739, EISSN: 2162-4321 - but ISSN.org does not know about it?" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [], - "source": [ - "unique_issn = set([item for v in mapping.values() for item in v])" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 99, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"2162-4321\" in unique_issn" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fieldiana. 
['2162-4267', '2162-4321', '2162-4291', '0096-2651', '0015-0746', '0097-3572', '2163-7105', '0071-4739', '0015-0754', '2158-5520', '0096-0438', '2162-4348']\n", - "Fieldiana. Anthropology ['2162-4321', '0071-4739']\n", - "Fieldiana. Anthropology (Online) ['2162-4321', '0071-4739']\n" - ] - } - ], - "source": [ - "for k, v in mapping.items():\n", - " if \"2162-4321\" in v:\n", - " print(k, v)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As an example: when given a string like \"Fieldiana\" we would return ambiguous. But \"Fieldiana. (Online)\" might be matched to '2162-4321'. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Other Examples" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
571163Afhandling.2
1800364Advances in e-business research series (Online)2
2476171Journal of Automatic Control (Online)2
2756754Gerencia de riesgos y seguros.2
2799421˜The œskinny (Scotland ed. Online)2
1254438Accessible news (Print)2
2177530Biomathematical and biomechanical modeling of ...2
2093431˜Le œJura socialiste (Saint-Claude)2
2752857European Journal of Formal Sciences and Engine...2
389738Austral journal of veterinary sciences (Online)2
\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "571163 Afhandling. 2\n", - "1800364 Advances in e-business research series (Online) 2\n", - "2476171 Journal of Automatic Control (Online) 2\n", - "2756754 Gerencia de riesgos y seguros. 2\n", - "2799421 ˜The œskinny (Scotland ed. Online) 2\n", - "1254438 Accessible news (Print) 2\n", - "2177530 Biomathematical and biomechanical modeling of ... 2\n", - "2093431 ˜Le œJura socialiste (Saint-Claude) 2\n", - "2752857 European Journal of Formal Sciences and Engine... 2\n", - "389738 Austral journal of veterinary sciences (Online) 2" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "repeated_names.sample(n=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['1608-3318', '1070-3284']" - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"Russian journal of coordination chemistry.\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A new question: How many journals are listed under different names, yet still refer to the same journal?" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['0733-2289']" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"San Bernardino County popular street atlas\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Journal of ..." - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameissn_count
983Journal of vocational behavior.2
984Journal of vocational behavior (Print)2
1213Journal of American Concrete Institute.1
1214Journal of the American Concrete Institute1
1317Journal of the American Dental Hygienists' Ass...1
.........
2938472Journal of surgical procedures and case reports1
2938580Journal of Epidermiological Society of Nigeria...2
2938581Journal of Epidermiological Society of Nigeria2
2938795Journal of Practicing Teachers1
2938796Journal of Practicing Teachers (Uyo. Online)1
\n", - "

40257 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " name issn_count\n", - "983 Journal of vocational behavior. 2\n", - "984 Journal of vocational behavior (Print) 2\n", - "1213 Journal of American Concrete Institute. 1\n", - "1214 Journal of the American Concrete Institute 1\n", - "1317 Journal of the American Dental Hygienists' Ass... 1\n", - "... ... ...\n", - "2938472 Journal of surgical procedures and case reports 1\n", - "2938580 Journal of Epidermiological Society of Nigeria... 2\n", - "2938581 Journal of Epidermiological Society of Nigeria 2\n", - "2938795 Journal of Practicing Teachers 1\n", - "2938796 Journal of Practicing Teachers (Uyo. Online) 1\n", - "\n", - "[40257 rows x 2 columns]" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.name.str.startswith(\"Journal of\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Journal_Names.pdf b/notebooks/Journal_Names.pdf deleted file mode 100644 index 04f392b..0000000 Binary files a/notebooks/Journal_Names.pdf and /dev/null differ diff --git a/notebooks/Journal_Names.zip b/notebooks/Journal_Names.zip deleted file mode 100644 index 00e96a3..0000000 Binary files a/notebooks/Journal_Names.zip and /dev/null differ diff --git a/notes/bm.md b/notes/bm.md deleted file mode 100644 index b6c3a7c..0000000 --- a/notes/bm.md +++ /dev/null @@ -1,19 +0,0 @@ -# b/m - -## cluster, verify - -* git pull deploy, aitio -* cluster example -* test with - -## regatedl match results - 
-* https://git.archive.org/martin/regatedl, in fixtures: https://git.archive.org/martin/regatedl/-/tree/master/fixtures - -## the temp data structure - -* should go in ~/.cache/... -* sqlite; TSV - -## tigris ideas - diff --git a/notes/clustering.md b/notes/clustering.md deleted file mode 100644 index 3f6312c..0000000 --- a/notes/clustering.md +++ /dev/null @@ -1,102 +0,0 @@ -# Clustering - -Original dataset: - -``` -$ sha1sum release_export_expanded.json.zst -fa7ce335e27bbf6ccee227992ecd9b860e8e36af release_export_expanded.json.zst - -$ zstdcat -T0 release_export_expanded.json.zst | wc -l -``` - -Various clusters (title, title normalized, title nysiis (New York State -Identification and Intelligence System), ...): - -``` -$ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -t title > cluster_title.json -``` - -Parallel (TODO: use `--pipepart`): - -``` -$ zstdcat -T0 release_export_expanded.json.zst | \ - parallel --tmpdir /bigger/tmp --roundrobin --pipe -j 16 \ - fuzzycat-cluster --tmpdir /bigger/tmp -t title > cluster_title.json -``` - -Numbers of clusters: - -``` - 141022216 cluster_title.json - 134709771 cluster_title_normalized.json - 119829458 cluster_title_nysiis.json -``` - -The number of duplicate records goes up as the number of clusters goes down: - -``` - 2858088 cluster_title_dups.json - 5818143 cluster_title_normalized_dups.json - 6274940 cluster_title_nysiis_dups.json -``` - -# Cluster numbers - -Using normalized title as an example: - -* 4306860 have cluster size 2, 1511283 have cluster size 3 or larger - -``` - size len -count 5818143.000 5818143.000 -mean 4.350 52.120 -std 196.347 35.026 -min 2.000 0.000 -25% 2.000 24.000 -50% 2.000 46.000 -75% 3.000 72.000 -max 151383.000 11686.000 -``` - -Around 448170 clusters with size 5 or more (with some example titles): - -``` -Medical Notes -日本鉄鋼協会第97回講演大会講演概要 -Boutades -Allergic Contact Dermatitis -Comité international -Incontinence -Efficient Uncertainty Minimization for Fuzzy Spectral Clustering 
-Early Intervention -CURRENT READINGS IN NUCLEAR MEDICINE -Nannocystis exedens -``` - -Grouping. API, hide. - -* gnu parallel; top, htop; how much; "chunks"; read one line; "pipepart"; - batching; "read from a file"; scan a file; "chunking" - -# TODO - -* [ ] do a SS like clustering, using title and author ngrams -* [ ] cluster by doi without "vX" suffix - -# Verification - -* we only need to look at identified duplicates, which will be a few million -* we want fast access to all release JSON blobs via ident, maybe do a - "fuzzycat-cache" that copies relevant files into the fs, e.g. -"~/.cache/fuzzycat/releases/d9/e4d4be49faafc750563351a126e7bafe29.json" or via microblob (but http we do not need), or sqlite3 (https://www.sqlite.org/fasterthanfs.html) - -For verification we need to have the cached json blobs in some fast, -thread-safe store. Estimated: 1K/s accesses, we still would need a few hours -for a run. - -* [ ] find all ids we need, generate cache, maybe reduce number of fields -* [ ] run verification on each cluster; generate a file in the same format, of - "verified" clusters; take note of the clustering and verification method - -Overall, we can combine various clustering and verification methods. We can -also put together a list of maybe 100-200 test cases and evaluate methods. diff --git a/notes/general.md b/notes/general.md deleted file mode 100644 index 03f6ec4..0000000 --- a/notes/general.md +++ /dev/null @@ -1,197 +0,0 @@ -# fuzzycat (wip) - -Fuzzy matching publications for [fatcat](https://fatcat.wiki). - -* [fuzzycat](https://pypi.org/project/fuzzycat/) - -Note: This is currently work-in-progress. - -## Motivation - -Most of the results on sites like [Google -Scholar](https://scholar.google.com/scholar?q=fuzzy+matching) group -publications into clusters. Each cluster represents one publication, abstracted -from its concrete representation as a link to a PDF. 
- -We call the abstract publication -[work](https://guide.fatcat.wiki/entity_work.html) and the concrete instance a -[release](https://guide.fatcat.wiki/entity_release.html). One goal is to group -releases under works and to implement a versions feature (self-match). Another -goal is to have support for matching of external lists (e.g. title lists or -other document) to the existing records. - -This repository contains both generic code for matching as well as fatcat -specific code using the fatcat openapi client. - -## Running and Deployment - -We defer more packaging polish until the code stabilizes a bit more. For now: - -``` -$ git clone git@github.com:miku/fuzzycat.git && cd fuzzycat -$ pipenv install --deploy -$ pipenv run python -m fuzzycat.main -``` - -For the future, an independent [pex](https://github.com/pantsbuild/pex) or -[shiv](https://github.com/linkedin/shiv) executable would be a convenient -option to allow execution from any directory. - -## Datasets - -A few relevant datasets are: - -* release and container metadata from a bulk fatcat export, e.g. - [https://archive.org/details/fatcat_bulk_exports_2020-08-05](https://archive.org/details/fatcat_bulk_exports_2020-08-05) -* issn journal level data, via [issnlister](https://github.com/miku/issnlister) -* journal abbreviation lists - -## Matching approaches - -![](static/approach.png) - -## Performance data points - -### Against elasticsearch - -Candidate generation via elasticsearch, 40 parallel queries, sustained speed at -about 17857 queries per hour, that is around 5 queries/s. - -``` -$ time cat ~/data/researchgate/x04 | \ - parallel -j40 --pipe -N 1 ./fatcatx_rg_unmatched.py - \ - > ~/data/researchgate/x04_results.ndj -... -real 3409m16.442s -user 29177m5.516s -sys 4927m3.277s -``` - -### Without a search index - -Candidate grouping for self-match can be done locally by extracting a key per -document, then a group by (via sort and uniq). 
Clustering 150M docs took about -607min (around 4k docs/s, no verification step). - -## Data issues - -### A republished article - -* [https://fatcat.wiki/release/search?q=%22The+doctor+with+seven+billion+patients%22](https://fatcat.wiki/release/search?q=%22The+doctor+with+seven+billion+patients%22) - -There is "student BMJ" and "BMJ" - this (html) article (interview) has been -first published on "sbmj" (Published 07 July 2011), then "bmj" (Published 10 -August 2011). - -> Notes; Originally published as: Student BMJ 2011;19:d3983 - -* https://www.bmj.com/content/343/sbmj.d3983 -* https://www.bmj.com/content/343/bmj.d4964 - -It is essentially the same text, same title, author, just different DOI and -probably a different recorded date. - -Generic pattern "republication" duplicate: - -* metadata mostly same, except date and doi - -### Common title - -Probably a few thousand very common short titles. - -* [https://fatcat.wiki/release/search?q=%22Book+Reviews%22](https://fatcat.wiki/release/search?q=%22Book+Reviews%22) (238852) - -Some authors do this regularly: - -* [https://fatcat.wiki/release/search?q=%22Book+Reviews%22+%22william%22+%22michael%22](https://fatcat.wiki/release/search?q=%22Book+Reviews%22+%22william%22+%22michael%22) (398) - -Different DOI, so we know it is different. 
- -More examples: - -* [https://fatcat.wiki/release/search?q=%22errata%22](https://fatcat.wiki/release/search?q=%22errata%22) (37680) -* [https://fatcat.wiki/release/search?q=%22Einleitung%22](https://fatcat.wiki/release/search?q=%22Einleitung%22) (68005) -* [https://fatcat.wiki/release/search?q=%22Notes%22](https://fatcat.wiki/release/search?q=%22Notes%22) (1507705) -* [https://fatcat.wiki/release/search?q=%22Letters+to+the+Editor%22](https://fatcat.wiki/release/search?q=%22Letters+to+the+Editor%22) (30976) - -### Title with extra data - -* like ISBN, ISSN, price and all kinds of extra metadata -* [https://fatcat.wiki/release/search?q=title%3A%22ISBN%22](https://fatcat.wiki/release/search?q=title%3A%22ISBN%22) -* titles typically get longer: [https://fatcat.wiki/release/olxswrilxfci3ibb3bg5xhstr4](https://fatcat.wiki/release/olxswrilxfci3ibb3bg5xhstr4) -* some of these are actually "reviews", e.g. [https://fatcat.wiki/release/4blc5mfc5bfaxkofuletqxuzp4](https://fatcat.wiki/release/4blc5mfc5bfaxkofuletqxuzp4) - -Another example: - -* too [long](https://fatcat.wiki/release/hewmq4afvnew7pwttvulzguubu), original suggested citation seems to be: - -> Parker, S. and Kerrod, R. (2002), "Children’s) Space Busters (1st) Looking at Stars (2nd)", Reference Reviews, Vol. 16 No. 5, pp. 26-27. https://doi.org/10.1108/rr.2002.16.5.26.252 - -### Sometimes a title will be ambiguous - -For example, given a title "Shakespeare in Tokyo" we would have to always return "ambiguous", as there are at least two separate publications with that name: - -* [https://fatcat.wiki/release/search?q=%22Shakespeare+in+Tokyo%22](https://fatcat.wiki/release/search?q=%22Shakespeare+in+Tokyo%22) - -This is similar to journal names, where some journal names will always be ambiguous. 
- -### Versions - -* same title, same authors, "vX" doi -* [https://fatcat.wiki/release/search?q=%22Self-similarity+analysis+of+the+non-linear%22](https://fatcat.wiki/release/search?q=%22Self-similarity+analysis+of+the+non-linear%22) - -Sometimes, we have a couple of preprint versions, plus a published version (with a slightly different title): - -* [https://fatcat.wiki/release/search?q=%22Time-periodic+solutions+of+massive%22](https://fatcat.wiki/release/search?q=%22Time-periodic+solutions+of+massive%22) - -### Almost same - -* same author, maybe year -* different DOI -* title almost the same, e.g. [MassIVE MSV000085583 - Aedes aegypti protein profile and proteome analysis](https://fatcat.wiki/release/search?q=%22Aedes+aegypti+protein+profile+and+proteome+analysis%22) - -### Duplication by different granularity - -* [https://fatcat.wiki/release/search?q=%22Volkshochschule+Leipzig%22](https://fatcat.wiki/release/search?q=%22Volkshochschule+Leipzig%22) (20308) -* contains both yearly entries, as well as "DOI per page", - [https://fatcat.wiki/release/r734v367nza4tl37j6d74rfqo4](https://fatcat.wiki/release/r734v367nza4tl37j6d74rfqo4); -could group pages under "container" of yearly release? -* We have [one container](https://github.com/internetarchive/fatcat/blob/4f80b87722d64f27c985f0040ea177269b6e028b/fatcat-openapi2.yml#L704-L709) per release, currently. - -### Partial titles - -A metadata title might differ from the full title. - -* [https://fatcat.wiki/release/search?q=%22Brain-derived+neurotrophic+factor%22](https://fatcat.wiki/release/search?q=%22Brain-derived+neurotrophic+factor%22) - -Here, the [release](https://fatcat.wiki/release/2vi655gcejffhnzzbkkcnjpscm) points to two PDFs, one is an article, the other a weekly report (summary). 
- -### Exact duplicates - -* [https://fatcat.wiki/release/search?q=%22WEIGHTED+LIPSCHITZ+ESTIMATES+FOR+COMMUTATORS+ON+WEIGHTED+MORREY-HERZ+SPACES%22](https://fatcat.wiki/release/search?q=%22WEIGHTED+LIPSCHITZ+ESTIMATES+FOR+COMMUTATORS+ON+WEIGHTED+MORREY-HERZ+SPACES%22) - -### Difference in Subtitle (invisible) - -Subtitle is not visible metadata, all same, except for the DOI and the page number. Different. - -* [https://fatcat.wiki/release/search?q=%22Slip+in+tungsten+monocarbide%22](https://fatcat.wiki/release/search?q=%22Slip+in+tungsten+monocarbide%22) - -### The "what a difference a char makes" case - -Typically a yearly report, or "part 1", "part 2", like this: - -* [https://fatcat.wiki/release/search?q=%22The+Use+of+Bone+Age+in+Clinical+Practice+%22](https://fatcat.wiki/release/search?q=%22The+Use+of+Bone+Age+in+Clinical+Practice+%22) - -The DOI differs; we could hard-code some patterns. - -### Published to two sites - -An article can have multiple DOIs, e.g. when republished by a site that gives out DOIs, such as researchgate. Example: - -* [Effect of Chlorophyll and Anthocyanin on the Secondary Bonds of Poly Vinyl Chloride](https://fatcat.wiki/release/search?q=%22Effect+of+Chlorophyll+and+Anthocyanin+on+the+Secondary+Bonds+of+Poly+Vinyl+Chloride+%22) - -> https://doi.org/10.11648/j.ijmsa.s.2015040201.15, https://doi.org/10.13140/rg.2.1.2398.3606 - -Probably many "10.13140"-prefixed DOIs have at least one other DOI. - -Some might be "rg-only", like this: [https://fatcat.wiki/release/search?q=%22Marco+de+trabajo+basado+en+los+datos+enlazados+para%22](https://fatcat.wiki/release/search?q=%22Marco+de+trabajo+basado+en+los+datos+enlazados+para%22) diff --git a/notes/todo.md b/notes/todo.md deleted file mode 100644 index 2c548b0..0000000 --- a/notes/todo.md +++ /dev/null @@ -1,23 +0,0 @@ -# Todo - -## Releases - -* [ ] stats of cases: versions, exact title matches; common prefixes (e.g. "XYZ Report 20XX", ...) 
- -## Containers - -* [ ] create notebook on duplicates -* [ ] static mapping, that is efficient to store, maybe via: https://github.com/pytries/marisa-trie - -If matching only by name, we need to look up an (exact) name. - -* need a mapping from "name" and "name variants" to journal "issnl" - -## Bulk - -* [ ] download export - -## Performance - -* provide some fast path - diff --git a/notes/workflow.md b/notes/workflow.md deleted file mode 100644 index 8cdd817..0000000 --- a/notes/workflow.md +++ /dev/null @@ -1,60 +0,0 @@ -# Workflow - -Split the problem in half: first find clusters, then examine clusters (as -proposed). - -## Finding clusters - -* group by raw exact title -* group by lowercase title -* group by slug title -* group by ngram title and authors -* group by ngram title (prefix, suffix) and authors -* group by elasticsearch -* group by doi without vX suffix -* group by soundex -* group by a simhash over the record - -As for performance, the feature needs to be calculated in one pass, then the -grouping reduces to a sort, in a second pass. - -The output could be a TSV file, with method and then release identifiers. - -``` -rawt o3utonw5qzhddo7l4lmwptgeey nnpmnwln7be2zb5hd2qanq3r7q -``` - -Or jsonlines for a bit of structure (e.g. method, ids) - -``` -{"m": "rawt", "c": ["o3utonw5qzhddo7l4lmwptgeey", "nnpmnwln7be2zb5hd2qanq3r7q"]} -``` - -``` -$ zstdcat -T0 release_export_expanded.json.zst | fuzzycat-cluster -g > clusters.json -``` - -### Performance considerations - -* [orjson](https://github.com/ijl/orjson), [pysimdjson](https://github.com/TkTech/pysimdjson) - -## Format - -Options: - -* emit minimal cluster information, e.g. method description and actual identifiers -* emit methods, and for each cluster item some core fields (title, author, id, date) - -## Examine cluster - -There will be various methods by which to examine the cluster as well. 
- -We need to fetch releases by identifier (API, but use "hide"), this can be the -full record or some partial record that has been cached somewhere. - -The input is then a list of releases and the output would be an equally sized or -smaller cluster of releases which we assume represent the same record. - -Apart from that, there may be different relations, e.g. not the exact same -thing, but something, that has an interval to it, like something that mostly -differs in year? diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index b0471b7..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel"] -build-backend = "setuptools.build_meta:__legacy__" \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 7a39369..0000000 --- a/setup.py +++ /dev/null @@ -1,50 +0,0 @@ -import setuptools - -from fuzzycat import __version__ - -with open("README.md", "r") as fh: - long_description = fh.read() - - setuptools.setup( - name="fuzzycat", - version=__version__, - author="Martin Czygan", - author_email="martin@archive.org", - description="Fuzzy matching utilities for scholarly metadata", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/miku/fuzzycat", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - python_requires=">=3.5", - zip_safe=False, - entry_points={"console_scripts": [ - "fuzzycat=fuzzycat.main:main" - ]}, - install_requires=[ - "elasticsearch>=7", - "ftfy", - "fuzzy", - "pydantic", - "toml", - "unidecode>=0.10", - # "fatcat-openapi-client", - # "simhash", - ], - extras_require={"dev": [ - "ipython", - "isort", - "jupyter", - "matplotlib", - "pandas", - "pylint", - "pytest", - "pytest-cov", - "twine", - "yapf", - ],}, - ) -- cgit v1.2.3