aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-08-14 16:55:26 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-08-14 16:55:26 +0200
commit63a0e47b5a3ab0c7a125b5255526d68548a4077f (patch)
treee548d02d80599cb8d14432c5809a58eb3710e198
parent799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (diff)
downloadfuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.tar.gz
fuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.zip
add de-jsonld flag
-rw-r--r--fuzzycat/issn.py72
1 files changed, 57 insertions, 15 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index e87cbb5..747618f 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -165,11 +165,21 @@ import json
import os
import sys
import re
-from typing import Iterable, Dict
+from typing import Iterable, Dict, List, Union
from fuzzycat.utils import SetEncoder
+def listify(v : Union[str, List[str]]) -> List[str]:
+ """
+ Sensible create a list.
+ """
+ if v is None:
+ return []
+ if isinstance(v, str):
+ return [v]
+ return v
+
def jsonld_minimal(v: Dict) -> Dict:
"""
Turn a JSON from issn.org into a smaller dict with a few core fields.
@@ -189,36 +199,65 @@ def jsonld_minimal(v: Dict) -> Dict:
id = item.get("@id")
if not id:
continue
+
+ # ISSN-L
match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
if match:
doc["issnl"] = match.group(1)
continue
+
+ # The "main" issn entry.
match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
if match:
- # Collect ids.
+ # if we do not have ISSN-L yet, check "exampleOfWork",
+ # "resource/ISSN/2658-0705"
+ if not "issnl" in doc:
+ match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", item.get("exampleOfWork", ""))
+ if match:
+ doc["issnl"] = match.group(1)
+
+ # note material
+ doc["material"] = listify(item.get("material", []))
+
+ # collect ids
issns = set([match.group(1)])
if item.get("identifier"):
issns.add(item.get("identifier"))
if item.get("issn"):
issns.add(item.get("issn"))
doc["issns"] = issns
+ # add urls
+ doc["urls"] = listify(item.get("url", []))
+ # add names, variants
+ names = listify(item.get("name")) + listify(item.get("alternateName"))
+ doc["names"] = list(set(names))
- names = item.get("name")
- if isinstance(names, str):
- names = [names]
- if isinstance(names, list):
- doc["names"] = names
-
- isFormatOf = item.get("isFormatOf", [])
- if isinstance(isFormatOf, str):
- isFormatOf = [isFormatOf]
-
- for v in isFormatOf:
+ # add related issn
+ for v in listify(item.get("isFormatOf", [])):
match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", v)
if match:
doc["issns"].add(match.group(1))
+
+ if "issnl" not in doc:
+ raise ValueError("entry without issnl: {}".format(item))
+
return doc
+def de_jsonld(lines: Iterable):
+ """
+ Convert to a minimal JSON format.
+ """
+ for line in lines:
+ line = line.strip()
+ try:
+ doc = json.loads(line)
+ doc = jsonld_minimal(doc)
+ except json.decoder.JSONDecodeError as exc:
+ print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
+ continue
+ else:
+ print(json.dumps(doc, cls=SetEncoder))
+
def generate_name_pairs(lines: Iterable):
"""
Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
@@ -261,9 +300,9 @@ def main():
parser.add_argument("--make-mapping",
action="store_true",
help="generate JSON mapping from name to list of ISSN")
- parser.add_argument("--make-module",
+ parser.add_argument("--de-jsonld",
action="store_true",
- help="generate Python lookup table module and write to stdout")
+ help="break up the jsonld")
args = parser.parse_args()
@@ -273,3 +312,6 @@ def main():
if args.make_pairs:
for issn, a, b in generate_name_pairs(args.file):
print("{}\t{}\t{}".format(issn, a, b))
+
+ if args.de_jsonld:
+ de_jsonld(args.file)