From ed3ef392cc840956a85e1e32ce443dde103d98db Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 12 Aug 2020 15:51:30 +0200 Subject: add journal name notebook --- notebooks/Journal_Names.html | 14635 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 14635 insertions(+) create mode 100644 notebooks/Journal_Names.html (limited to 'notebooks/Journal_Names.html') diff --git a/notebooks/Journal_Names.html b/notebooks/Journal_Names.html new file mode 100644 index 0000000..a87394a --- /dev/null +++ b/notebooks/Journal_Names.html @@ -0,0 +1,14635 @@ + + + + +Journal_Names + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+

Journal Names

Questions in the context of fuzzy matching.

+
    +
  • How many journal names appear more than once?
  • +
  • What is the average length of the duplicated names vs the unique names?
  • +
+

The input file is a single large JSON file, mapping names to ISSNs.

+
{
+  "Acta Orientalia.": [
+    "0001-6438"
+  ],
+  "Acta Orientalia (København)": [
+    "0001-6438"
+  ],
+..
+
+ +
+
+
+
+
+
In [4]:
+
+
+
import json
+import pandas as pd
+
+ +
+
+
+ +
+
+
+
In [5]:
+
+
+
with open("../data/name_to_issn.json") as f:
+    mapping = json.load(f)
+
+ +
+
+
+ +
+
+
+
+

We have about 3M keys.

+ +
+
+
+
+
+
In [7]:
+
+
+
len(mapping)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[7]:
+ + + + +
+
2929727
+
+ +
+ +
+
+ +
+
+
+
In [21]:
+
+
+
df = pd.DataFrame(((k, len(v)) for k, v in mapping.items()), columns=["name", "issn_count"])
+
+ +
+
+
+ +
+
+
+
In [25]:
+
+
+
len(df)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[25]:
+ + + + +
+
2929727
+
+ +
+ +
+
+ +
+
+
+
In [26]:
+
+
+
df.head()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[26]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
0Acta Orientalia.1
1Acta Orientalia (København)1
2˜The œpublishers weekly.1
3Publishers weekly1
4ASMT news1
+
+
+ +
+ +
+
+ +
+
+
+
In [31]:
+
+
+
unique_name = df[df.issn_count == 1]
+
+ +
+
+
+ +
+
+
+
In [32]:
+
+
+
repeated_names = df[df.issn_count > 1]
+
+ +
+
+
+ +
+
+
+
In [34]:
+
+
+
len(repeated_names)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[34]:
+ + + + +
+
194241
+
+ +
+ +
+
+ +
+
+
+
In [33]:
+
+
+
len(repeated_names) / len(df)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[33]:
+ + + + +
+
0.06630003409874026
+
+ +
+ +
+
+ +
+
+
+
+

About 6.6% (194241) of the names are repeated, i.e. they map to more than one ISSN.

+ +
+
+
+
+
+
In [35]:
+
+
+
repeated_names.describe()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[35]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
issn_count
count194241.000000
mean3.197523
std25.081605
min2.000000
25%2.000000
50%2.000000
75%2.000000
max8980.000000
+
+
+ +
+ +
+
+ +
+
+
+
+

Which name is shared by over 8000 ISSNs?

+ +
+
+
+
+
+
In [40]:
+
+
+
repeated_names.iloc[repeated_names.issn_count.argmax()] # Annual report.
+
+ +
+
+
+ +
+
+ + +
+ +
Out[40]:
+ + + + +
+
name          Annual report.
+issn_count              8980
+Name: 45907, dtype: object
+
+ +
+ +
+
+ +
+
+
+
+

It is the "Annual report."

+ +
+
+
+
+
+
In [42]:
+
+
+
mapping["Annual report."][:10]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[42]:
+ + + + +
+
['0706-537X',
+ '1186-7957',
+ '2324-1926',
+ '1445-9248',
+ '0872-3982',
+ '1714-1524',
+ '1037-8812',
+ '0225-0241',
+ '1327-6344',
+ '0702-7702']
+
+ +
+ +
+
+ +
+
+
+
+

On average, a repeated name points to about 3 ISSNs. About 24k names point to more than 3 ISSNs.

+ +
+
+
+
+
+
In [45]:
+
+
+
len(repeated_names[repeated_names.issn_count > 3])
+
+ +
+
+
+ +
+
+ + +
+ +
Out[45]:
+ + + + +
+
24107
+
+ +
+ +
+
+ +
+
+
+
In [49]:
+
+
+
repeated_names[repeated_names.issn_count > 3].sample(n=10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[49]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
322100Philosophica.17
183928Edad de oro.4
294309Horoskop.10
517039˜Le œGrand journal.11
1664616Caleidoscop şcolar.4
258430˜La œFeuille.34
309546˜The œWilson quarterly.4
795859Introductory research essay4
1470838Publicaciones del SEMYR.4
657041˜Le œKiosque.14
+
+
+ +
+ +
+
+ +
+
+
+
In [50]:
+
+
+
mapping["Philosophica."]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[50]:
+ + + + +
+
['1285-9133',
+ '1480-4670',
+ '1487-5349',
+ '1724-6598',
+ '2183-0134',
+ '2538-693X',
+ '2610-8933',
+ '2035-8326',
+ '2295-9084',
+ '1517-8889',
+ '2249-5053',
+ '2420-9198',
+ '2654-9263',
+ '2610-8925',
+ '1158-9574',
+ '0872-4784',
+ '0379-8402']
+
+ +
+ +
+
+ +
+
+
+
In [61]:
+
+
+
repeated_names[repeated_names.issn_count > 3].issn_count.hist(bins=20)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[61]:
+ + + + +
+
<AxesSubplot:>
+
+ +
+ +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [60]:
+
+
+
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 50)].issn_count.hist(bins=10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[60]:
+ + + + +
+
<AxesSubplot:>
+
+ +
+ +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [62]:
+
+
+
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 20)].issn_count.hist(bins=10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[62]:
+ + + + +
+
<AxesSubplot:>
+
+ +
+ +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [64]:
+
+
+
repeated_names[(repeated_names.issn_count > 3) & (repeated_names.issn_count < 8)].issn_count.hist()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[64]:
+ + + + +
+
<AxesSubplot:>
+
+ +
+ +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [70]:
+
+
+
repeated_names[repeated_names.issn_count > 1000].issn_count.hist(bins=10)
+
+ +
+
+
+ +
+
+ + +
+ +
Out[70]:
+ + + + +
+
<AxesSubplot:>
+
+ +
+ +
+ +
+ + + + +
+ +
+ +
+ +
+
+ +
+
+
+
In [71]:
+
+
+
repeated_names[repeated_names.issn_count > 1000]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[71]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
3499Bulletin.2752
7632Newsletter.2715
8317Rapport.1050
23662Proceedings.1403
45839Annual report /1090
45907Annual report.8980
45964Annuaire.1260
47217Rapport annuel.2656
+
+
+ +
+ +
+
+ +
+
+
+
In [72]:
+
+
+
repeated_names[repeated_names.issn_count > 500]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[72]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
102Bulletin d'information.693
3218Bulletin de liaison.510
3499Bulletin.2752
7632Newsletter.2715
8317Rapport.1050
23662Proceedings.1403
45794Report.743
45839Annual report /1090
45907Annual report.8980
45964Annuaire.1260
46370Jaarverslag.675
47142Rapport d'activité.660
47217Rapport annuel.2656
49289Jahresbericht.518
57558Annual report760
121599Alumni directory /511
128827Bulletin municipal.521
150529˜La œLettre.623
168933Local climatological data.613
269004Estimates.535
+
+
+ +
+ +
+
+ +
+
+
+
In [75]:
+
+
+
repeated_names[repeated_names.issn_count > 200]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[75]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
102Bulletin d'information.693
2665Newsletter /259
3218Bulletin de liaison.510
3499Bulletin.2752
3926Boletín.216
.........
425644Rapport d'activité ...394
532500Relatório e contas.247
603144Bildung und Beruf regional.292
1006131Vies de famille.222
1110247Country risk service.271
+

66 rows × 2 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [76]:
+
+
+
repeated_names[repeated_names.issn_count > 100]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[76]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
102Bulletin d'information.693
2665Newsletter /259
3218Bulletin de liaison.510
3499Bulletin.2752
3926Boletín.216
.........
1306798Country commerce.120
1318569Bible studies for life.159
1796742LexisNexis practice guide.101
2628387Operational risk report.119
2650557Interempresas net.108
+

191 rows × 2 columns

+
+
+ +
+ +
+
+ +
+
+
+
In [82]:
+
+
+
repeated_names
+
+ +
+
+
+ +
+
+ + +
+ +
Out[82]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
nameissn_count
5Activitas Nervosa Superior.2
11Library journal.2
23Acta cardiologica.2
26Actualidad económica.3
31Acta Ornithologica.3
.........
2929626Modern machine shop México.2
2929635Lecture notes in control and information scien...2
2929646Critical Studies in Dance Leadership and Inclu...2
2929691Nigerian Journal of Wildlife Management2
2929702Verzeichniss der Kunstwerke lebender Künstler,...2
+

194241 rows × 2 columns

+
+
+ +
+ +
+
+ +
+
+
+
+

If a name matches a repeated name exactly, or fuzzy-matches a repeated name, and there is no other information available, the match status must be ambiguous.

+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+ + + + + + -- cgit v1.2.3