diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-12 14:20:41 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-12 14:24:31 +0200 |
commit | 5a307829670888fedd696e6220c84feed1fe6b64 (patch) | |
tree | 87e5046442ad95239c1f60982a191ceb1d8b1c9f | |
parent | f96c3d0d025ad37836eb908d561b0c607a1f7b5e (diff) | |
download | fuzzycat-5a307829670888fedd696e6220c84feed1fe6b64.tar.gz fuzzycat-5a307829670888fedd696e6220c84feed1fe6b64.zip |
stub tool: fuzzycat-issn to generate test data
Currently, `fuzzycat-issn --make-pairs` generates a TSV of (issn, a, b) examples, e.g.
...
0011-9717 Detskaâ literatura. Детская литература.
0011-9717 Detskaâ literatura. Detskaâ literatura
0011-9717 Детская литература. Detskaâ literatura
0011-6637 Darbininkas. Darbininkas
0012-0820 deutsche Tabakbau deutsche Tabakbau.
0011-5444 Daily Kent stater. Daily Kent stater
...
The idea is that these names by definition denote the same journal. We
might even have a fixed lookup table, since some variants involve
multiple scripts (and there are only around 2M names in total).
Currently 1992176 pairs can be generated.
-rw-r--r-- | fuzzycat/issn.py | 69 |
1 files changed, 69 insertions, 0 deletions
# fuzzycat/issn.py -- new file added by this commit (shown without diff markers).
"""
Munge the ISSN data so we get some container name test data out of it.

    ...
    "issn": "0000-0019",
    "mainTitle": "<U+0098>The <U+009C>publishers weekly.",
    "name": [
      "<U+0098>The <U+009C>publishers weekly.",
      "Publishers weekly"
    ],

Public data from ISSN via:
https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister
(https://github.com/miku/issnlister) to aggregate.

The dataset contains naming variants in "name".

Q1: How many of these variants would our matching algorithm detect?

For that, we need a dataset that generates pairs (a, b) from all names (the
mainTitle is just one of the names).

"""

import argparse
import sys
import os
import json
import itertools


def _name_pairs(doc):
    """Yield ``(issn, a, b)`` for every unordered pair of names in *doc*.

    *doc* is one parsed JSON object; its ``"@graph"`` list is scanned and
    every item that carries a plausible ISSN and at least two names
    contributes all 2-combinations of those names.

    Raises:
        ValueError: if an item's ``"name"`` is neither a string nor a list.
    """
    for item in doc.get("@graph", []):
        issn = item.get("issn", "")
        if not issn:
            continue
        # Cheap shape check only: nine characters with a hyphen in the middle.
        if len(issn) != 9 or issn[4] != "-":
            continue
        names = item.get("name")
        if not names:
            continue
        # A single name may be given as a bare string instead of a list.
        if isinstance(names, str):
            names = [names]
        if not isinstance(names, list):
            raise ValueError("expected a list: {} {}".format(names, type(names)))
        if len(names) < 2:
            continue
        for a, b in itertools.combinations(names, 2):
            yield issn, a, b


def main():
    """Command line entry point for generating name-pair test data.

    Reads one JSON object per line from the given file, or from standard
    input when no file is given. With ``--make-pairs``, prints one
    tab-separated ``issn, a, b`` row per pair of names sharing an ISSN.
    Lines that fail to parse as JSON are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser()
    # nargs="?" makes the positional optional; without it argparse requires
    # the argument and the stdin default could never take effect.
    parser.add_argument("file",
                        nargs="?",
                        default=sys.stdin,
                        type=argparse.FileType("r"),
                        help="public data from issn, one JSON object per line")
    parser.add_argument("--make-pairs", action="store_true")

    args = parser.parse_args()

    if not args.make_pairs:
        return

    for line in args.file:
        line = line.strip()
        if not line:
            # Blank lines carry no record; do not report them as JSON errors.
            continue
        try:
            doc = json.loads(line)
        except json.decoder.JSONDecodeError as exc:
            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
            continue
        for issn, a, b in _name_pairs(doc):
            print("{}\t{}\t{}".format(issn, a, b))