diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-14 17:10:49 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-14 17:10:49 +0200 |
commit | 25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a (patch) | |
tree | b3b5f1a6f9fe742fa9ba4948c17251de4e48f83a | |
parent | 370c4ad75694f3e4346660038fe91ef732652b70 (diff) | |
download | fuzzycat-25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a.tar.gz fuzzycat-25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a.zip |
issn: pair with issnl
-rw-r--r-- | fuzzycat/issn.py | 45 |
1 files changed, 26 insertions, 19 deletions
```diff
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 747618f..d95fa9a 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -155,7 +155,6 @@ Example JSON LD response from ISSN:
   ]
 }
 
-
 """
 
 import argparse
@@ -163,14 +162,14 @@ import collections
 import itertools
 import json
 import os
-import sys
 import re
-from typing import Iterable, Dict, List, Union
+import sys
+from typing import Dict, Iterable, List, Union
 
 from fuzzycat.utils import SetEncoder
 
 
-def listify(v : Union[str, List[str]]) -> List[str]:
+def listify(v: Union[str, List[str]]) -> List[str]:
     """
     Sensible create a list.
     """
@@ -180,6 +179,7 @@ def listify(v : Union[str, List[str]]) -> List[str]:
         return [v]
     return v
 
+
 def jsonld_minimal(v: Dict) -> Dict:
     """
     Turn a JSON from issn.org into a smaller dict with a few core fields.
@@ -212,7 +212,8 @@ def jsonld_minimal(v: Dict) -> Dict:
         # if we do not have ISSN-L yet, check "exampleOfWork",
         # "resource/ISSN/2658-0705"
         if not "issnl" in doc:
-            match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", item.get("exampleOfWork", ""))
+            match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$",
+                             item.get("exampleOfWork", ""))
             if match:
                 doc["issnl"] = match.group(1)
 
@@ -243,6 +244,7 @@ def jsonld_minimal(v: Dict) -> Dict:
 
     return doc
 
+
 def de_jsonld(lines: Iterable):
     """
     Convert to a minimal JSON format.
@@ -250,30 +252,37 @@ def de_jsonld(lines: Iterable):
     for line in lines:
         line = line.strip()
         try:
-            doc = json.loads(line)
-            doc = jsonld_minimal(doc)
+            doc = jsonld_minimal(json.loads(line))
         except json.decoder.JSONDecodeError as exc:
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
         else:
             print(json.dumps(doc, cls=SetEncoder))
 
+
 def generate_name_pairs(lines: Iterable):
     """
-    Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
+    Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors.
+
+    Example output:
+
+        0013-211X Eendracht-bode (Tholen) Eendracht-bode.
+        0012-7388 Dynamic maturity Dynamic maturity.
+        0012-6055 Drehpunkt. Drehpunkt (Basel. 1968)
+
+    Basically, these would be free test cases, since we would like to report "match" on most of these.
+
     """
     for line in lines:
         line = line.strip()
         try:
-            doc = json.loads(line)
-            doc = jsonld_minimal(doc)
+            doc = jsonld_minimal(json.loads(line))
         except json.decoder.JSONDecodeError as exc:
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
-        for issn in doc.get("issns", []):
-            for a, b in itertools.combinations(doc.get("names", []), 2):
-                yield (issn, a, b)
+        for a, b in itertools.combinations(doc.get("names", []), 2):
+            yield (doc["issnl"], a, b)
 
 
 def generate_name_issn_mapping(lines: Iterable):
@@ -282,9 +291,9 @@ def generate_name_issn_mapping(lines: Iterable):
     might be reused.
     """
     mapping = collections.defaultdict(set)
-    for issn, a, b in generate_name_pairs(lines):
-        mapping[a].add(issn)
-        mapping[b].add(issn)
+    for issnl, a, b in generate_name_pairs(lines):
+        mapping[a].add(issnl)
+        mapping[b].add(issnl)
     return mapping
 
 
@@ -300,9 +309,7 @@ def main():
     parser.add_argument("--make-mapping",
                         action="store_true",
                         help="generate JSON mapping from name to list of ISSN")
-    parser.add_argument("--de-jsonld",
-                        action="store_true",
-                        help="break up the jsonld")
+    parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
 
     args = parser.parse_args()
 
```