issn: jsonld breakup

author: Martin Czygan <martin.czygan@gmail.com> 2020-08-13 15:01:10 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-13 15:01:10 +0200
commit: 799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (patch)
tree: bb07284e42bb6829da2f6eaeb7943637c79448a1
parent: e303e1bb3e2bd2e66e5d225628aa2768c42cdeed (diff)
download: fuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.tar.gz
fuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.zip
1 files changed, 190 insertions, 25 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 55204db..e87cbb5 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -20,6 +20,142 @@ Q1: How many of these variants would our matching algorithm detect?
 For that, we need a dataset that generates pairs (a, b) from all names (the
 mainTitle is just one of the name).
 
+Example JSON LD response from ISSN:
+
+{
+  "@context": {
+    "format": {
+      "@id": "http://purl.org/dc/elements/1.1/format",
+      "@type": "@id"
+    },
+    "identifiedBy": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/identifiedBy",
+      "@type": "@id"
+    },
+    "identifier": {
+      "@id": "http://purl.org/dc/elements/1.1/identifier"
+    },
+    "isPartOf": {
+      "@id": "http://schema.org/isPartOf",
+      "@type": "@id"
+    },
+    "issn": {
+      "@id": "http://purl.org/ontology/bibo/issn"
+    },
+    "label": {
+      "@id": "http://www.w3.org/2000/01/rdf-schema#label"
+    },
+    "location": {
+      "@id": "http://schema.org/location",
+      "@type": "@id"
+    },
+    "mainEntity": {
+      "@id": "http://schema.org/mainEntity",
+      "@type": "@id"
+    },
+    "modified": {
+      "@id": "http://purl.org/dc/terms/modified",
+      "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
+    },
+    "name": {
+      "@id": "http://schema.org/name"
+    },
+    "publication": {
+      "@id": "http://schema.org/publication",
+      "@type": "@id"
+    },
+    "status": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/status",
+      "@type": "@id"
+    },
+    "title": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/title",
+      "@type": "@id"
+    },
+    "type": {
+      "@id": "http://purl.org/dc/terms/type",
+      "@type": "@id"
+    },
+    "value": {
+      "@id": "http://www.w3.org/1999/02/22-rdf-syntax-ns#value"
+    },
+    "wasAttributedTo": {
+      "@id": "http://www.w3.org/ns/prov#wasAttributedTo",
+      "@type": "@id"
+    }
+  },
+  "@graph": [
+    {
+      "@id": "http://id.loc.gov/vocabulary/countries/pl",
+      "label": "Poland"
+    },
+    {
+      "@id": "organization/ISSNCenter#57",
+      "@type": "http://schema.org/Organization"
+    },
+    {
+      "@id": "resource/ISSN-L/0001-4125",
+      "identifiedBy": "resource/ISSN/0001-4125#ISSN-L"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125",
+      "@type": [
+        "http://schema.org/Periodical",
+        "http://id.loc.gov/ontologies/bibframe/Instance",
+        "http://id.loc.gov/ontologies/bibframe/Work"
+      ],
+      "format": "vocabularies/medium#Print",
+      "http://schema.org/issn": "0001-4125",
+      "identifiedBy": [
+        "resource/ISSN/0001-4125#ISSN-L",
+        "resource/ISSN/0001-4125#KeyTitle",
+        "resource/ISSN/0001-4125#ISSN"
+      ],
+      "identifier": "0001-4125",
+      "isPartOf": "resource/ISSN-L/0001-4125",
+      "issn": "0001-4125",
+      "name": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques",
+      "publication": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+      "title": "resource/ISSN/0001-4125#KeyTitle",
+      "type": "http://marc21rdf.info/terms/formofmaterial#a"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ISSN",
+      "@type": "http://id.loc.gov/ontologies/bibframe/Issn",
+      "status": "vocabularies/IdentifierStatus#Valid",
+      "value": "0001-4125"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ISSN-L",
+      "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
+      "status": "vocabularies/IdentifierStatus#Valid",
+      "value": "0001-4125"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#KeyTitle",
+      "@type": [
+        "http://id.loc.gov/ontologies/bibframe/Identifier",
+        "http://id.loc.gov/ontologies/bibframe/KeyTitle"
+      ],
+      "value": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#Record",
+      "@type": "http://schema.org/CreativeWork",
+      "mainEntity": "resource/ISSN/0001-4125",
+      "modified": "20051223105700.0",
+      "status": "vocabularies/RecordStatus#Register",
+      "wasAttributedTo": "organization/ISSNCenter#57"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+      "@type": "http://schema.org/PublicationEvent",
+      "location": "http://id.loc.gov/vocabulary/countries/pl"
+    }
+  ]
+}
+
+
 """
 
 import argparse
@@ -28,11 +164,61 @@ import itertools
 import json
 import os
 import sys
-from typing import Iterable
+import re
+from typing import Iterable, Dict
 
 from fuzzycat.utils import SetEncoder
 
 
+def jsonld_minimal(v: Dict) -> Dict:
+    """
+    Turn a JSON from issn.org into a smaller dict with a few core fields.
+
+    Example result: {'issnl': '0008-2554', 'issns': {'0008-2554'}, 'names':
+        ['Canada agriculture (Ottawa)', 'Canada agriculture.']}
+
+    Returns an empty dict if the input has no usable "@graph" list. Graph
+    entries or field values of an unexpected type are skipped, not raised on.
+    """
+    items = v.get("@graph") if isinstance(v, dict) else None
+    if not isinstance(items, list):
+        return {}
+    issnl_re = re.compile(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$")
+    issn_re = re.compile(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$")
+
+    def strings(value) -> list:
+        # A JSON-LD value may be a single string or a list; keep strings only.
+        if isinstance(value, str):
+            return [value]
+        return [s for s in value if isinstance(s, str)] if isinstance(value, list) else []
+
+    doc = {}
+    for item in items:
+        # "@id": "resource/ISSN-L/0001-4125"
+        # "@id": "resource/ISSN/0001-4125"
+        # ...
+        ident = item.get("@id") if isinstance(item, dict) else None
+        if not isinstance(ident, str):
+            continue
+        match = issnl_re.match(ident)
+        if match:
+            doc["issnl"] = match.group(1)
+            continue
+        match = issn_re.match(ident)
+        if not match:
+            continue
+        # Collect ids: the one in "@id", plain id fields, and isFormatOf refs.
+        issns = {match.group(1)}
+        issns.update(s for key in ("identifier", "issn") for s in strings(item.get(key)) if s)
+        refs = (issn_re.match(ref) for ref in strings(item.get("isFormatOf")))
+        issns.update(m.group(1) for m in refs if m)
+        doc["issns"] = issns
+
+        name = item.get("name")
+        if isinstance(name, (str, list)):
+            doc["names"] = strings(name)
+    return doc
+
 def generate_name_pairs(lines: Iterable):
     """
     Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
@@ -42,33 +228,12 @@ def generate_name_pairs(lines: Iterable):
         line = line.strip()
         try:
             doc = json.loads(line)
+            doc = jsonld_minimal(doc)
         except json.decoder.JSONDecodeError as exc:
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
-        for item in doc.get("@graph", []):
-            issn = item.get("issn", "")
-            if not issn:
-                continue
-            if len(issn) != 9:
-                continue
-            if issn[4] != "-":
-                continue
-            names = item.get("name")
-            if not names:
-                continue
-            if isinstance(names, str):
-                names = [names]
-            if not isinstance(names, list):
-                raise ValueError("expected a list: {} {}".format(names, type(names)))
-            if len(names) < 2:
-                continue
-
-            # Some names contain whitespace in the database, "      Mystery &
-            # detection annual" -- control character prefixes (e.g. C2 98)
-            # remain.
-            names = [s.strip() for s in names]
-
-            for a, b in itertools.combinations(names, 2):
+        for issn in doc.get("issns", []):
+            for a, b in itertools.combinations(doc.get("names", []), 2):
                 yield (issn, a, b)
author	Martin Czygan <martin.czygan@gmail.com>	2020-08-13 15:01:10 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-08-13 15:01:10 +0200
commit	799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (patch)
tree	bb07284e42bb6829da2f6eaeb7943637c79448a1
parent	e303e1bb3e2bd2e66e5d225628aa2768c42cdeed (diff)
download	fuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.tar.gz fuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.zip