about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-08-14 17:10:49 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-14 17:10:49 +0200
commit: 25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a (patch)
tree: b3b5f1a6f9fe742fa9ba4948c17251de4e48f83a
parent: 370c4ad75694f3e4346660038fe91ef732652b70 (diff)
download: fuzzycat-25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a.tar.gz
fuzzycat-25c1858a1a93681fe9fefa5dc19a2b9b7ed0154a.zip
issn: pair with issnl
-rw-r--r-- fuzzycat/issn.py 45
1 files changed, 26 insertions, 19 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 747618f..d95fa9a 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -155,7 +155,6 @@ Example JSON LD response from ISSN:
]
}
-
"""
import argparse
@@ -163,14 +162,14 @@ import collections
import itertools
import json
import os
-import sys
import re
-from typing import Iterable, Dict, List, Union
+import sys
+from typing import Dict, Iterable, List, Union
from fuzzycat.utils import SetEncoder
-def listify(v : Union[str, List[str]]) -> List[str]:
+def listify(v: Union[str, List[str]]) -> List[str]:
"""
Sensible create a list.
"""
@@ -180,6 +179,7 @@ def listify(v : Union[str, List[str]]) -> List[str]:
return [v]
return v
+
def jsonld_minimal(v: Dict) -> Dict:
"""
Turn a JSON from issn.org into a smaller dict with a few core fields.
@@ -212,7 +212,8 @@ def jsonld_minimal(v: Dict) -> Dict:
# if we do not have ISSN-L yet, check "exampleOfWork",
# "resource/ISSN/2658-0705"
if not "issnl" in doc:
- match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", item.get("exampleOfWork", ""))
+ match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$",
+ item.get("exampleOfWork", ""))
if match:
doc["issnl"] = match.group(1)
@@ -243,6 +244,7 @@ def jsonld_minimal(v: Dict) -> Dict:
return doc
+
def de_jsonld(lines: Iterable):
"""
Convert to a minimal JSON format.
@@ -250,30 +252,37 @@ def de_jsonld(lines: Iterable):
for line in lines:
line = line.strip()
try:
- doc = json.loads(line)
- doc = jsonld_minimal(doc)
+ doc = jsonld_minimal(json.loads(line))
except json.decoder.JSONDecodeError as exc:
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
else:
print(json.dumps(doc, cls=SetEncoder))
+
def generate_name_pairs(lines: Iterable):
"""
- Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
+ Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
errors.
+
+ Example output:
+
+ 0013-211X Eendracht-bode (Tholen) Eendracht-bode.
+ 0012-7388 Dynamic maturity Dynamic maturity.
+ 0012-6055 Drehpunkt. Drehpunkt (Basel. 1968)
+
+ Basically, these would be free test cases, since we would like to report "match" on most of these.
+
"""
for line in lines:
line = line.strip()
try:
- doc = json.loads(line)
- doc = jsonld_minimal(doc)
+ doc = jsonld_minimal(json.loads(line))
except json.decoder.JSONDecodeError as exc:
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
- for issn in doc.get("issns", []):
- for a, b in itertools.combinations(doc.get("names", []), 2):
- yield (issn, a, b)
+ for a, b in itertools.combinations(doc.get("names", []), 2):
+ yield (doc["issnl"], a, b)
def generate_name_issn_mapping(lines: Iterable):
@@ -282,9 +291,9 @@ def generate_name_issn_mapping(lines: Iterable):
might be reused.
"""
mapping = collections.defaultdict(set)
- for issn, a, b in generate_name_pairs(lines):
- mapping[a].add(issn)
- mapping[b].add(issn)
+ for issnl, a, b in generate_name_pairs(lines):
+ mapping[a].add(issnl)
+ mapping[b].add(issnl)
return mapping
@@ -300,9 +309,7 @@ def main():
parser.add_argument("--make-mapping",
action="store_true",
help="generate JSON mapping from name to list of ISSN")
- parser.add_argument("--de-jsonld",
- action="store_true",
- help="break up the jsonld")
+ parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
args = parser.parse_args()