add de-jsonld flag

author: Martin Czygan <martin.czygan@gmail.com> 2020-08-14 16:55:26 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-14 16:55:26 +0200
commit: 63a0e47b5a3ab0c7a125b5255526d68548a4077f (patch)
tree: e548d02d80599cb8d14432c5809a58eb3710e198
parent: 799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (diff)
download: fuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.tar.gz
fuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.zip
1 files changed, 57 insertions, 15 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index e87cbb5..747618f 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -165,11 +165,21 @@ import json
 import os
 import sys
 import re
-from typing import Iterable, Dict
+from typing import Iterable, Dict, List, Union
 
 from fuzzycat.utils import SetEncoder
 
 
+def listify(v : Union[str, List[str]]) -> List[str]:
+    """
+    Sensibly create a list.
+    """
+    if v is None:
+        return []
+    if isinstance(v, str):
+        return [v]
+    return v
+
 def jsonld_minimal(v: Dict) -> Dict:
     """
     Turn a JSON from issn.org into a smaller dict with a few core fields.
@@ -189,36 +199,65 @@ def jsonld_minimal(v: Dict) -> Dict:
         id = item.get("@id")
         if not id:
             continue
+
+        # ISSN-L
         match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
         if match:
             doc["issnl"] = match.group(1)
             continue
+
+        # The "main" issn entry.
         match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
         if match:
-            # Collect ids.
+            # if we do not have ISSN-L yet, check "exampleOfWork",
+            # "resource/ISSN/2658-0705"
+            if not "issnl" in doc:
+                match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", item.get("exampleOfWork", ""))
+                if match:
+                    doc["issnl"] = match.group(1)
+
+            # note material
+            doc["material"] = listify(item.get("material", []))
+
+            # collect ids
             issns = set([match.group(1)])
             if item.get("identifier"):
                 issns.add(item.get("identifier"))
             if item.get("issn"):
                 issns.add(item.get("issn"))
             doc["issns"] = issns
+            # add urls
+            doc["urls"] = listify(item.get("url", []))
+            # add names, variants
+            names = listify(item.get("name")) + listify(item.get("alternateName"))
+            doc["names"] = list(set(names))
 
-            names = item.get("name")
-            if isinstance(names, str):
-                names = [names]
-            if isinstance(names, list):
-                doc["names"] = names
-
-            isFormatOf = item.get("isFormatOf", [])
-            if isinstance(isFormatOf, str):
-                isFormatOf = [isFormatOf]
-
-            for v in isFormatOf:
+            # add related issn
+            for v in listify(item.get("isFormatOf", [])):
                 match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", v)
                 if match:
                     doc["issns"].add(match.group(1))
+
+    if "issnl" not in doc:
+        raise ValueError("entry without issnl: {}".format(item))
+
     return doc
 
+def de_jsonld(lines: Iterable):
+    """
+    Convert to a minimal JSON format.
+    """
+    for line in lines:
+        line = line.strip()
+        try:
+            doc = json.loads(line)
+            doc = jsonld_minimal(doc)
+        except json.decoder.JSONDecodeError as exc:
+            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
+            continue
+        else:
+            print(json.dumps(doc, cls=SetEncoder))
+
 def generate_name_pairs(lines: Iterable):
     """
     Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
@@ -261,9 +300,9 @@ def main():
     parser.add_argument("--make-mapping",
                         action="store_true",
                         help="generate JSON mapping from name to list of ISSN")
-    parser.add_argument("--make-module",
+    parser.add_argument("--de-jsonld",
                         action="store_true",
-                        help="generate Python lookup table module and write to stdout")
+                        help="break up the jsonld")
 
     args = parser.parse_args()
 
@@ -273,3 +312,6 @@ def main():
     if args.make_pairs:
         for issn, a, b in generate_name_pairs(args.file):
             print("{}\t{}\t{}".format(issn, a, b))
+
+    if args.de_jsonld:
+        de_jsonld(args.file)
author	Martin Czygan <martin.czygan@gmail.com>	2020-08-14 16:55:26 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-08-14 16:55:26 +0200
commit	63a0e47b5a3ab0c7a125b5255526d68548a4077f (patch)
tree	e548d02d80599cb8d14432c5809a58eb3710e198
parent	799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (diff)
download	fuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.tar.gz fuzzycat-63a0e47b5a3ab0c7a125b5255526d68548a4077f.zip