From 5a307829670888fedd696e6220c84feed1fe6b64 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 12 Aug 2020 14:20:41 +0200 Subject: stub tool: fuzzycat-issn to generate test data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit currently: fuzzycat-issn --make-pairs will generate a TSV of (issn, a, b) examples, e.g. ... 0011-9717 Detskaâ literatura. Детская литература. 0011-9717 Detskaâ literatura. Detskaâ literatura 0011-9717 Детская литература. Detskaâ literatura 0011-6637 Darbininkas. Darbininkas 0012-0820 deutsche Tabakbau deutsche Tabakbau. 0011-5444 Daily Kent stater. Daily Kent stater ... The idea is that these names by definition denote the same journal. We might even have a fixed lookup table, since some variants involve multiple scripts (and there are only around 2M names in total). Currently 1992176 pairs can be generated. --- fuzzycat/issn.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 fuzzycat/issn.py diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py new file mode 100644 index 0000000..8108885 --- /dev/null +++ b/fuzzycat/issn.py @@ -0,0 +1,69 @@ +""" +Munge the ISSN data so we get some container name test data out of it. + + ... + "issn": "0000-0019", + "mainTitle": "The publishers weekly.", + "name": [ + "The publishers weekly.", + "Publishers weekly" + ], + +Public data from ISSN via: +https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister +(https://github.com/miku/issnlister) to aggregate. + +The dataset contains naming variants in "name". + +Q1: How many of these variants would our matching algorithm detect? + +For that, we need a dataset that generates pairs (a, b) from all names (the +mainTitle is just one of the names). 
import argparse
import sys
import os
import json
import itertools


def _iter_name_pairs(doc):
    """Yield ``(issn, a, b)`` for each unordered pair of names per record.

    Args:
        doc: A decoded JSON object (dict). Records are read from its
            "@graph" key; a missing or null "@graph" yields nothing.

    Yields:
        Tuples ``(issn, a, b)`` where ``a`` and ``b`` are two entries of the
        record's "name" value, in the order ``itertools.combinations``
        produces them.

    Raises:
        ValueError: If a record's "name" is neither a string nor a list.
    """
    # `or []` also covers an explicit null "@graph".
    for item in doc.get("@graph") or []:
        if not isinstance(item, dict):
            continue
        issn = item.get("issn", "")
        # Cheap shape check only: nine characters with a dash in the middle.
        if not (isinstance(issn, str) and len(issn) == 9 and issn[4] == "-"):
            continue
        names = item.get("name")
        if not names:
            continue
        if isinstance(names, str):
            names = [names]
        if not isinstance(names, list):
            raise ValueError("expected a list: {} {}".format(names, type(names)))
        # Fewer than two names produces no combinations, so no extra guard.
        for a, b in itertools.combinations(names, 2):
            yield issn, a, b


def main(argv=None):
    """Command line entry point: turn ISSN records into name-pair test data.

    Reads one JSON object per line from the given file, or from stdin when no
    file is given. With ``--make-pairs``, prints one tab-separated line
    ``issn, a, b`` to stdout for every pair of names of every record.
    Blank lines are skipped; lines that are not valid JSON, or whose JSON
    value is not an object, are reported on stderr and skipped.

    Args:
        argv: Optional list of command line arguments; defaults to
            ``sys.argv[1:]`` when None.

    Raises:
        ValueError: If a record's "name" is neither a string nor a list.
    """
    parser = argparse.ArgumentParser()
    # nargs="?" is what makes the positional optional; without it argparse
    # ignores the default and always demands a file argument.
    parser.add_argument("file",
                        nargs="?",
                        default=sys.stdin,
                        type=argparse.FileType("r", encoding="utf-8"),
                        help="public data from issn, one JSON object per line (default: stdin)")
    parser.add_argument("--make-pairs", action="store_true")

    args = parser.parse_args(argv)

    try:
        if args.make_pairs:
            for line in args.file:
                line = line.strip()
                if not line:
                    continue
                try:
                    doc = json.loads(line)
                except json.decoder.JSONDecodeError as exc:
                    print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
                    continue
                if not isinstance(doc, dict):
                    print("expected a json object, data: {}".format(line), file=sys.stderr)
                    continue
                for issn, a, b in _iter_name_pairs(doc):
                    print("{}\t{}\t{}".format(issn, a, b))
    finally:
        # Close files we opened ourselves; leave the process-wide stdin alone.
        if args.file is not sys.stdin:
            args.file.close()