aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-08-13 15:01:10 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-08-13 15:01:10 +0200
commit799544a8f488d3b1fb3e33b8e5710a697f0e7e18 (patch)
treebb07284e42bb6829da2f6eaeb7943637c79448a1
parente303e1bb3e2bd2e66e5d225628aa2768c42cdeed (diff)
downloadfuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.tar.gz
fuzzycat-799544a8f488d3b1fb3e33b8e5710a697f0e7e18.zip
issn: jsonld breakup
-rw-r--r--fuzzycat/issn.py215
1 files changed, 190 insertions, 25 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 55204db..e87cbb5 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -20,6 +20,142 @@ Q1: How many of these variants would our matching algorithm detect?
For that, we need a dataset that generates pairs (a, b) from all names (the
mainTitle is just one of the names).
+Example JSON LD response from ISSN:
+
+{
+ "@context": {
+ "format": {
+ "@id": "http://purl.org/dc/elements/1.1/format",
+ "@type": "@id"
+ },
+ "identifiedBy": {
+ "@id": "http://id.loc.gov/ontologies/bibframe/identifiedBy",
+ "@type": "@id"
+ },
+ "identifier": {
+ "@id": "http://purl.org/dc/elements/1.1/identifier"
+ },
+ "isPartOf": {
+ "@id": "http://schema.org/isPartOf",
+ "@type": "@id"
+ },
+ "issn": {
+ "@id": "http://purl.org/ontology/bibo/issn"
+ },
+ "label": {
+ "@id": "http://www.w3.org/2000/01/rdf-schema#label"
+ },
+ "location": {
+ "@id": "http://schema.org/location",
+ "@type": "@id"
+ },
+ "mainEntity": {
+ "@id": "http://schema.org/mainEntity",
+ "@type": "@id"
+ },
+ "modified": {
+ "@id": "http://purl.org/dc/terms/modified",
+ "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
+ },
+ "name": {
+ "@id": "http://schema.org/name"
+ },
+ "publication": {
+ "@id": "http://schema.org/publication",
+ "@type": "@id"
+ },
+ "status": {
+ "@id": "http://id.loc.gov/ontologies/bibframe/status",
+ "@type": "@id"
+ },
+ "title": {
+ "@id": "http://id.loc.gov/ontologies/bibframe/title",
+ "@type": "@id"
+ },
+ "type": {
+ "@id": "http://purl.org/dc/terms/type",
+ "@type": "@id"
+ },
+ "value": {
+ "@id": "http://www.w3.org/1999/02/22-rdf-syntax-ns#value"
+ },
+ "wasAttributedTo": {
+ "@id": "http://www.w3.org/ns/prov#wasAttributedTo",
+ "@type": "@id"
+ }
+ },
+ "@graph": [
+ {
+ "@id": "http://id.loc.gov/vocabulary/countries/pl",
+ "label": "Poland"
+ },
+ {
+ "@id": "organization/ISSNCenter#57",
+ "@type": "http://schema.org/Organization"
+ },
+ {
+ "@id": "resource/ISSN-L/0001-4125",
+ "identifiedBy": "resource/ISSN/0001-4125#ISSN-L"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125",
+ "@type": [
+ "http://schema.org/Periodical",
+ "http://id.loc.gov/ontologies/bibframe/Instance",
+ "http://id.loc.gov/ontologies/bibframe/Work"
+ ],
+ "format": "vocabularies/medium#Print",
+ "http://schema.org/issn": "0001-4125",
+ "identifiedBy": [
+ "resource/ISSN/0001-4125#ISSN-L",
+ "resource/ISSN/0001-4125#KeyTitle",
+ "resource/ISSN/0001-4125#ISSN"
+ ],
+ "identifier": "0001-4125",
+ "isPartOf": "resource/ISSN-L/0001-4125",
+ "issn": "0001-4125",
+ "name": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques",
+ "publication": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+ "title": "resource/ISSN/0001-4125#KeyTitle",
+ "type": "http://marc21rdf.info/terms/formofmaterial#a"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125#ISSN",
+ "@type": "http://id.loc.gov/ontologies/bibframe/Issn",
+ "status": "vocabularies/IdentifierStatus#Valid",
+ "value": "0001-4125"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125#ISSN-L",
+ "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
+ "status": "vocabularies/IdentifierStatus#Valid",
+ "value": "0001-4125"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125#KeyTitle",
+ "@type": [
+ "http://id.loc.gov/ontologies/bibframe/Identifier",
+ "http://id.loc.gov/ontologies/bibframe/KeyTitle"
+ ],
+ "value": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125#Record",
+ "@type": "http://schema.org/CreativeWork",
+ "mainEntity": "resource/ISSN/0001-4125",
+ "modified": "20051223105700.0",
+ "status": "vocabularies/RecordStatus#Register",
+ "wasAttributedTo": "organization/ISSNCenter#57"
+ },
+ {
+ "@id": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+ "@type": "http://schema.org/PublicationEvent",
+ "location": "http://id.loc.gov/vocabulary/countries/pl"
+ }
+ ]
+}
+
+
"""
import argparse
@@ -28,11 +164,61 @@ import itertools
import json
import os
import sys
-from typing import Iterable
+import re
+from typing import Iterable, Dict
from fuzzycat.utils import SetEncoder
+def jsonld_minimal(v: Dict) -> Dict:
+    """
+    Turn a JSON from issn.org into a smaller dict with a few core fields.
+
+    Walks the "@graph" list of the JSON-LD document and collects:
+
+    * "issnl": the ISSN-L, taken from a node with id "resource/ISSN-L/<issn>",
+    * "issns": a set of ISSN found on the node with id "resource/ISSN/<issn>"
+      (the id itself, "identifier", "issn" and any "isFormatOf" references),
+    * "names": the "name" value(s) of that node, always as a list.
+
+    A key is only present if the corresponding data was found. A document
+    without a non-empty "@graph" yields an empty dict. If several
+    "resource/ISSN/<issn>" nodes exist, the last one wins. Graph entries that
+    are not dicts, or whose "@id" is not a string, are skipped.
+
+    Example result: {'issnl': '0008-2554', 'issns': {'0008-2554'}, 'names':
+    ['Canada agriculture (Ottawa)', 'Canada agriculture.']}
+    """
+    items = v.get("@graph")
+    if not items:
+        return {}
+
+    # "@id": "resource/ISSN-L/0001-4125"
+    # "@id": "resource/ISSN/0001-4125"
+    # ...
+    issnl_pattern = re.compile(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$")
+    issn_pattern = re.compile(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$")
+
+    doc = {}
+    for item in items:
+        if not isinstance(item, dict):
+            continue
+        node_id = item.get("@id")
+        # Only strings can be matched; anything else is not a node we want.
+        if not isinstance(node_id, str) or not node_id:
+            continue
+
+        match = issnl_pattern.match(node_id)
+        if match:
+            doc["issnl"] = match.group(1)
+            continue
+
+        match = issn_pattern.match(node_id)
+        if not match:
+            continue
+
+        # Collect ids.
+        issns = {match.group(1)}
+        for key in ("identifier", "issn"):
+            value = item.get(key)
+            if value:
+                issns.add(value)
+        doc["issns"] = issns
+
+        names = item.get("name")
+        if isinstance(names, str):
+            names = [names]
+        if isinstance(names, list):
+            doc["names"] = names
+
+        # "isFormatOf" may be absent, null, a single reference or a list.
+        related = item.get("isFormatOf") or []
+        if isinstance(related, str):
+            related = [related]
+        for ref in related:
+            if not isinstance(ref, str):
+                continue
+            match = issn_pattern.match(ref)
+            if match:
+                issns.add(match.group(1))
+    return doc
+
+
def generate_name_pairs(lines: Iterable):
"""
Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
@@ -42,33 +228,12 @@ def generate_name_pairs(lines: Iterable):
line = line.strip()
try:
doc = json.loads(line)
+ doc = jsonld_minimal(doc)
except json.decoder.JSONDecodeError as exc:
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
- for item in doc.get("@graph", []):
- issn = item.get("issn", "")
- if not issn:
- continue
- if len(issn) != 9:
- continue
- if issn[4] != "-":
- continue
- names = item.get("name")
- if not names:
- continue
- if isinstance(names, str):
- names = [names]
- if not isinstance(names, list):
- raise ValueError("expected a list: {} {}".format(names, type(names)))
- if len(names) < 2:
- continue
-
- # Some names contain whitespace in the database, " Mystery &
- # detection annual" -- control character prefixes (e.g. C2 98)
- # remain.
- names = [s.strip() for s in names]
-
- for a, b in itertools.combinations(names, 2):
+ for issn in doc.get("issns", []):
+ for a, b in itertools.combinations(doc.get("names", []), 2):
yield (issn, a, b)