about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/issn.py 103
-rw-r--r-- fuzzycat/utils.py 16
2 files changed, 88 insertions, 31 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 8108885..55204db 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -23,10 +23,65 @@ mainTitle is just one of the name).
"""
import argparse
-import sys
-import os
-import json
+import collections
import itertools
+import json
+import os
+import sys
+from typing import Iterable
+
+from fuzzycat.utils import SetEncoder
+
+
+def generate_name_pairs(lines: Iterable):
+ """
+ Given JSON lines, yield tuples (issn, a, b) of test data. Lines that fail
+ to parse as JSON are reported on stderr and skipped.
+ """
+ for line in lines:
+ line = line.strip()
+ try:
+ doc = json.loads(line)
+ except json.decoder.JSONDecodeError as exc:
+ print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
+ continue
+ for item in doc.get("@graph", []):
+ issn = item.get("issn", "")
+ if not issn:
+ continue
+ if len(issn) != 9:
+ continue
+ if issn[4] != "-":
+ continue
+ names = item.get("name")
+ if not names:
+ continue
+ if isinstance(names, str):
+ names = [names]
+ if not isinstance(names, list):
+ raise ValueError("expected a list: {} {}".format(names, type(names)))
+ if len(names) < 2:
+ continue
+
+ # Some names carry leading whitespace in the database, e.g. " Mystery &
+ # detection annual"; strip it. Control character prefixes (e.g. C2 98)
+ # remain.
+ names = [s.strip() for s in names]
+
+ for a, b in itertools.combinations(names, 2):
+ yield (issn, a, b)
+
+
+def generate_name_issn_mapping(lines: Iterable):
+ """
+ Given JSON lines, generate a dictionary mapping names to sets of ISSN. Names
+ might be reused.
+ """
+ mapping = collections.defaultdict(set)
+ for issn, a, b in generate_name_pairs(lines):
+ mapping[a].add(issn)
+ mapping[b].add(issn)
+ return mapping
def main():
@@ -35,35 +90,21 @@ def main():
default=sys.stdin,
type=argparse.FileType("r"),
help="public data from issn, one JSON object per line")
- parser.add_argument("--make-pairs", action="store_true")
+ parser.add_argument("--make-pairs",
+ action="store_true",
+ help="generate TSV and write to stdout")
+ parser.add_argument("--make-mapping",
+ action="store_true",
+ help="generate JSON mapping from name to list of ISSN")
+ parser.add_argument("--make-module",
+ action="store_true",
+ help="generate Python lookup table module and write to stdout")
args = parser.parse_args()
+ if args.make_mapping:
+ print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder))
+
if args.make_pairs:
- for line in args.file:
- line = line.strip()
- try:
- doc = json.loads(line)
- except json.decoder.JSONDecodeError as exc:
- print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
- continue
- for item in doc.get("@graph", []):
- issn = item.get("issn", "")
- if not issn:
- continue
- if len(issn) != 9:
- continue
- if issn[4] != "-":
- continue
- names = item.get("name")
- if not names:
- continue
- if isinstance(names, str):
- names = [names]
- if not isinstance(names, list):
- raise ValueError("expected a list: {} {}".format(names, type(names)))
- if len(names) < 2:
- continue
-
- for a, b in itertools.combinations(names, 2):
- print("{}\t{}\t{}".format(issn, a, b))
+ for issn, a, b in generate_name_pairs(args.file):
+ print("{}\t{}\t{}".format(issn, a, b))
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 3a4be99..97125ce 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -2,6 +2,7 @@
import collections
import itertools
+import json
import re
import string
from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence
@@ -10,6 +11,21 @@ A couple of utilities, may be split up into separate modules.
"""
+class SetEncoder(json.JSONEncoder):
+ """
+ Helper to encode python sets into JSON lists.
+ So you can write something like this:
+ json.dumps({"things": set([1, 2, 3])}, cls=SetEncoder)
+ """
+ def default(self, obj):
+ """
+ Decorate call to standard implementation.
+ """
+ if isinstance(obj, set):
+ return list(obj)
+ return json.JSONEncoder.default(self, obj)
+
+
class StringPipeline:
"""
Minimalistic grouping of functions applied on an input string to produce