From 5a307829670888fedd696e6220c84feed1fe6b64 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 12 Aug 2020 14:20:41 +0200
Subject: stub tool: fuzzycat-issn to generate test data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, fuzzycat-issn --make-pairs generates a TSV of (issn, a, b) examples, e.g.

    ...
    0011-9717       Detskaâ literatura.     Детская литература.
    0011-9717       Detskaâ literatura.     Detskaâ literatura
    0011-9717       Детская литература.     Detskaâ literatura
    0011-6637       Darbininkas.            Darbininkas
    0012-0820       deutsche Tabakbau       deutsche Tabakbau.
    0011-5444       Daily Kent stater.      Daily Kent stater
    ...

The idea is that these names by definition denote the same journal. We
might even have a fixed lookup table, since some variants involve
multiple scripts (and there are only around 2M names in total).

Currently 1992176 pairs can be generated.
---
 fuzzycat/issn.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 fuzzycat/issn.py

diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
new file mode 100644
index 0000000..8108885
--- /dev/null
+++ b/fuzzycat/issn.py
@@ -0,0 +1,69 @@
+"""
+Munge the ISSN data so we get some container name test data out of it.
+
+      ...
+      "issn": "0000-0019",
+      "mainTitle": "<U+0098>The <U+009C>publishers weekly.",
+      "name": [
+        "<U+0098>The <U+009C>publishers weekly.",
+        "Publishers weekly"
+      ],
+
+Public data from ISSN via:
+https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister
+(https://github.com/miku/issnlister) to aggregate.
+
+The dataset contains naming variants in "name".
+
+Q1: How many of these variants would our matching algorithm detect?
+
+For that, we need a dataset that generates pairs (a, b) from all names (the
+mainTitle is just one of the names).
+
+"""
+
+import argparse
+import sys
+import os
+import json
+import itertools
+
+
+def main():
+    """Read ISSN JSON lines and, with --make-pairs, print (issn, a, b) TSV rows.
+
+    Unparseable lines are reported on stderr and skipped. Raises ValueError
+    when a "name" value is neither a string nor a list.
+    """
+    parser = argparse.ArgumentParser()
+    # nargs="?" is what lets argparse fall back to the stdin default.
+    parser.add_argument("file",
+                        nargs="?",
+                        default=sys.stdin,
+                        type=argparse.FileType("r"),
+                        help="public data from issn, one JSON object per line")
+    parser.add_argument("--make-pairs", action="store_true")
+    args = parser.parse_args()
+    if not args.make_pairs:
+        return
+    for line in args.file:
+        line = line.strip()
+        try:
+            doc = json.loads(line)
+        except json.decoder.JSONDecodeError as exc:
+            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
+            continue
+        for item in doc.get("@graph", []):
+            issn = item.get("issn", "")
+            # Keep only values with nine characters and a hyphen at index 4.
+            if not issn or len(issn) != 9 or issn[4] != "-":
+                continue
+            names = item.get("name")
+            if not names:
+                continue
+            if isinstance(names, str):
+                names = [names]
+            if not isinstance(names, list):
+                raise ValueError("expected a list: {} {}".format(names, type(names)))
+            for a, b in itertools.combinations(names, 2):  # empty for < 2 names
+                print("{}\t{}\t{}".format(issn, a, b))
-- 
cgit v1.2.3