diff options
-rw-r--r-- | fuzzycat/issn.py | 215 |
1 files changed, 190 insertions, 25 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 55204db..e87cbb5 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -20,6 +20,142 @@ Q1: How many of these variants would our matching algorithm detect? For that,
 we need a dataset that generates pairs (a, b) from all names (the mainTitle is
 just one of the name).
 
+Example JSON LD response from ISSN:
+
+{
+  "@context": {
+    "format": {
+      "@id": "http://purl.org/dc/elements/1.1/format",
+      "@type": "@id"
+    },
+    "identifiedBy": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/identifiedBy",
+      "@type": "@id"
+    },
+    "identifier": {
+      "@id": "http://purl.org/dc/elements/1.1/identifier"
+    },
+    "isPartOf": {
+      "@id": "http://schema.org/isPartOf",
+      "@type": "@id"
+    },
+    "issn": {
+      "@id": "http://purl.org/ontology/bibo/issn"
+    },
+    "label": {
+      "@id": "http://www.w3.org/2000/01/rdf-schema#label"
+    },
+    "location": {
+      "@id": "http://schema.org/location",
+      "@type": "@id"
+    },
+    "mainEntity": {
+      "@id": "http://schema.org/mainEntity",
+      "@type": "@id"
+    },
+    "modified": {
+      "@id": "http://purl.org/dc/terms/modified",
+      "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
+    },
+    "name": {
+      "@id": "http://schema.org/name"
+    },
+    "publication": {
+      "@id": "http://schema.org/publication",
+      "@type": "@id"
+    },
+    "status": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/status",
+      "@type": "@id"
+    },
+    "title": {
+      "@id": "http://id.loc.gov/ontologies/bibframe/title",
+      "@type": "@id"
+    },
+    "type": {
+      "@id": "http://purl.org/dc/terms/type",
+      "@type": "@id"
+    },
+    "value": {
+      "@id": "http://www.w3.org/1999/02/22-rdf-syntax-ns#value"
+    },
+    "wasAttributedTo": {
+      "@id": "http://www.w3.org/ns/prov#wasAttributedTo",
+      "@type": "@id"
+    }
+  },
+  "@graph": [
+    {
+      "@id": "http://id.loc.gov/vocabulary/countries/pl",
+      "label": "Poland"
+    },
+    {
+      "@id": "organization/ISSNCenter#57",
+      "@type": "http://schema.org/Organization"
+    },
+    {
+      "@id": "resource/ISSN-L/0001-4125",
+      "identifiedBy": "resource/ISSN/0001-4125#ISSN-L"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125",
+      "@type": [
+        "http://schema.org/Periodical",
+        "http://id.loc.gov/ontologies/bibframe/Instance",
+        "http://id.loc.gov/ontologies/bibframe/Work"
+      ],
+      "format": "vocabularies/medium#Print",
+      "http://schema.org/issn": "0001-4125",
+      "identifiedBy": [
+        "resource/ISSN/0001-4125#ISSN-L",
+        "resource/ISSN/0001-4125#KeyTitle",
+        "resource/ISSN/0001-4125#ISSN"
+      ],
+      "identifier": "0001-4125",
+      "isPartOf": "resource/ISSN-L/0001-4125",
+      "issn": "0001-4125",
+      "name": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques",
+      "publication": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+      "title": "resource/ISSN/0001-4125#KeyTitle",
+      "type": "http://marc21rdf.info/terms/formofmaterial#a"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ISSN",
+      "@type": "http://id.loc.gov/ontologies/bibframe/Issn",
+      "status": "vocabularies/IdentifierStatus#Valid",
+      "value": "0001-4125"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ISSN-L",
+      "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
+      "status": "vocabularies/IdentifierStatus#Valid",
+      "value": "0001-4125"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#KeyTitle",
+      "@type": [
+        "http://id.loc.gov/ontologies/bibframe/Identifier",
+        "http://id.loc.gov/ontologies/bibframe/KeyTitle"
+      ],
+      "value": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#Record",
+      "@type": "http://schema.org/CreativeWork",
+      "mainEntity": "resource/ISSN/0001-4125",
+      "modified": "20051223105700.0",
+      "status": "vocabularies/RecordStatus#Register",
+      "wasAttributedTo": "organization/ISSNCenter#57"
+    },
+    {
+      "@id": "resource/ISSN/0001-4125#ReferencePublicationEvent",
+      "@type": "http://schema.org/PublicationEvent",
+      "location": "http://id.loc.gov/vocabulary/countries/pl"
+    }
+  ]
+}
+
+
 """
 
 import argparse
@@ -28,11 +164,61 @@ import itertools
 import json
 import os
 import sys
-from typing import Iterable
+import re
+from typing import Iterable, Dict
 
 from fuzzycat.utils import SetEncoder
 
 
+def jsonld_minimal(v: Dict) -> Dict:
+    """
+    Turn a JSON from issn.org into a smaller dict with a few core fields.
+
+    Example result: {'issnl': '0008-2554', 'issns': {'0008-2554'}, 'names':
+    ['Canada agriculture (Ottawa)', 'Canada agriculture.']}
+    """
+    items = v.get("@graph")
+    if not items:
+        return {}
+    doc = {}
+    for item in items:
+        pass
+        # "@id": "resource/ISSN-L/0001-4125"
+        # "@id": "resource/ISSN/0001-4125"
+        # ...
+        id = item.get("@id")
+        if not id:
+            continue
+        match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
+        if match:
+            doc["issnl"] = match.group(1)
+            continue
+        match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
+        if match:
+            # Collect ids.
+            issns = set([match.group(1)])
+            if item.get("identifier"):
+                issns.add(item.get("identifier"))
+            if item.get("issn"):
+                issns.add(item.get("issn"))
+            doc["issns"] = issns
+
+            names = item.get("name")
+            if isinstance(names, str):
+                names = [names]
+            if isinstance(names, list):
+                doc["names"] = names
+
+            isFormatOf = item.get("isFormatOf", [])
+            if isinstance(isFormatOf, str):
+                isFormatOf = [isFormatOf]
+
+            for v in isFormatOf:
+                match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", v)
+                if match:
+                    doc["issns"].add(match.group(1))
+    return doc
+
 def generate_name_pairs(lines: Iterable):
     """
     Given JSON lines, yield a tuple (issn, a, b) of test data. Will skip on
@@ -42,33 +228,12 @@ def generate_name_pairs(lines: Iterable):
         line = line.strip()
         try:
             doc = json.loads(line)
+            doc = jsonld_minimal(doc)
         except json.decoder.JSONDecodeError as exc:
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
-        for item in doc.get("@graph", []):
-            issn = item.get("issn", "")
-            if not issn:
-                continue
-            if len(issn) != 9:
-                continue
-            if issn[4] != "-":
-                continue
-            names = item.get("name")
-            if not names:
-                continue
-            if isinstance(names, str):
-                names = [names]
-            if not isinstance(names, list):
-                raise ValueError("expected a list: {} {}".format(names, type(names)))
-            if len(names) < 2:
-                continue
-
-            # Some names contain whitespace in the database, " Mystery &
-            # detection annual" -- control character prefixes (e.g. C2 98)
-            # remain.
-            names = [s.strip() for s in names]
-
-            for a, b in itertools.combinations(names, 2):
+        for issn in doc.get("issns", []):
+            for a, b in itertools.combinations(doc.get("names", []), 2):
                 yield (issn, a, b)
 
 