diff options
-rw-r--r-- | fuzzycat/issn.py | 103 | ||||
-rw-r--r-- | fuzzycat/utils.py | 16 |
2 files changed, 88 insertions, 31 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py index 8108885..55204db 100644 --- a/fuzzycat/issn.py +++ b/fuzzycat/issn.py @@ -23,10 +23,65 @@ mainTitle is just one of the name). """ import argparse -import sys -import os -import json +import collections import itertools +import json +import os +import sys +from typing import Iterable + +from fuzzycat.utils import SetEncoder + + +def generate_name_pairs(lines: Iterable): + """ + Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on + errors. + """ + for line in lines: + line = line.strip() + try: + doc = json.loads(line) + except json.decoder.JSONDecodeError as exc: + print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr) + continue + for item in doc.get("@graph", []): + issn = item.get("issn", "") + if not issn: + continue + if len(issn) != 9: + continue + if issn[4] != "-": + continue + names = item.get("name") + if not names: + continue + if isinstance(names, str): + names = [names] + if not isinstance(names, list): + raise ValueError("expected a list: {} {}".format(names, type(names))) + if len(names) < 2: + continue + + # Some names contain whitespace in the database, " Mystery & + # detection annual" -- control character prefixes (e.g. C2 98) + # remain. + names = [s.strip() for s in names] + + for a, b in itertools.combinations(names, 2): + yield (issn, a, b) + + +def generate_name_issn_mapping(lines: Iterable): + """ + Given JSON lines, generate a dictionary mapping names to sets of ISSN. Names + might be reused.
+ """ + mapping = collections.defaultdict(set) + for issn, a, b in generate_name_pairs(lines): + mapping[a].add(issn) + mapping[b].add(issn) + return mapping def main(): @@ -35,35 +90,21 @@ def main(): default=sys.stdin, type=argparse.FileType("r"), help="public data from issn, one JSON object per line") - parser.add_argument("--make-pairs", action="store_true") + parser.add_argument("--make-pairs", + action="store_true", + help="generate TSV and write to stdout") + parser.add_argument("--make-mapping", + action="store_true", + help="generate JSON mapping from name to list of ISSN") + parser.add_argument("--make-module", + action="store_true", + help="generate Python lookup table module and write to stdout") args = parser.parse_args() + if args.make_mapping: + print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder)) + if args.make_pairs: - for line in args.file: - line = line.strip() - try: - doc = json.loads(line) - except json.decoder.JSONDecodeError as exc: - print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr) - continue - for item in doc.get("@graph", []): - issn = item.get("issn", "") - if not issn: - continue - if len(issn) != 9: - continue - if issn[4] != "-": - continue - names = item.get("name") - if not names: - continue - if isinstance(names, str): - names = [names] - if not isinstance(names, list): - raise ValueError("expected a list: {} {}".format(names, type(names))) - if len(names) < 2: - continue - - for a, b in itertools.combinations(names, 2): - print("{}\t{}\t{}".format(issn, a, b)) + for issn, a, b in generate_name_pairs(args.file): + print("{}\t{}\t{}".format(issn, a, b)) diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 3a4be99..97125ce 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -2,6 +2,7 @@ import collections import itertools +import json import re import string from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence @@ -10,6 +11,21 @@ A couple of 
utilities, may be split up into separate modules. """ +class SetEncoder(json.JSONEncoder): + """ + Helper to encode Python sets into JSON lists. + So you can write something like this: + json.dumps({"things": set([1, 2, 3])}, cls=SetEncoder) + """ + def default(self, obj): + """ + Decorate call to standard implementation. + """ + if isinstance(obj, set): + return list(obj) + return json.JSONEncoder.default(self, obj) + + class StringPipeline: """ Minimalistic grouping of functions applied on an input string to produce |