cleanup

author: Martin Czygan <martin.czygan@gmail.com> 2020-10-21 03:54:53 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-10-21 03:54:53 +0200
commit: c134c0974d0fc8b57a0d3329d389ac72120a01bb (patch)
tree: 676ccc717e96dcca92e56e2a490c5c0d18240f45 /fuzzycat/issn.py
parent: 2cd5ec9f9c3c91dfe79c98f7d73112b88061d383 (diff)
download: fuzzycat-c134c0974d0fc8b57a0d3329d389ac72120a01bb.tar.gz
fuzzycat-c134c0974d0fc8b57a0d3329d389ac72120a01bb.zip
1 files changed, 0 insertions, 401 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
deleted file mode 100644
index aa6b78a..0000000
--- a/fuzzycat/issn.py
+++ /dev/null
@@ -1,401 +0,0 @@
-"""
-Munge the ISSN data so we get some container name test data out of it.
-
-      ...
-      "issn": "0000-0019",
-      "mainTitle": "<U+0098>The <U+009C>publishers weekly.",
-      "name": [
-        "<U+0098>The <U+009C>publishers weekly.",
-        "Publishers weekly"
-      ],
-
-Public data from ISSN via:
-https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister
-(https://github.com/miku/issnlister) to aggregate.
-
-The dataset contains naming variants in "name".
-
-Q1: How many of these variants would our matching algorithm detect?
-
-For that, we need a dataset that generates pairs (a, b) from all names (the
-mainTitle is just one of the name).
-
-Example JSON LD response from ISSN:
-
-{
-  "@context": {
-    "format": {
-      "@id": "http://purl.org/dc/elements/1.1/format",
-      "@type": "@id"
-    },
-    "identifiedBy": {
-      "@id": "http://id.loc.gov/ontologies/bibframe/identifiedBy",
-      "@type": "@id"
-    },
-    "identifier": {
-      "@id": "http://purl.org/dc/elements/1.1/identifier"
-    },
-    "isPartOf": {
-      "@id": "http://schema.org/isPartOf",
-      "@type": "@id"
-    },
-    "issn": {
-      "@id": "http://purl.org/ontology/bibo/issn"
-    },
-    "label": {
-      "@id": "http://www.w3.org/2000/01/rdf-schema#label"
-    },
-    "location": {
-      "@id": "http://schema.org/location",
-      "@type": "@id"
-    },
-    "mainEntity": {
-      "@id": "http://schema.org/mainEntity",
-      "@type": "@id"
-    },
-    "modified": {
-      "@id": "http://purl.org/dc/terms/modified",
-      "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
-    },
-    "name": {
-      "@id": "http://schema.org/name"
-    },
-    "publication": {
-      "@id": "http://schema.org/publication",
-      "@type": "@id"
-    },
-    "status": {
-      "@id": "http://id.loc.gov/ontologies/bibframe/status",
-      "@type": "@id"
-    },
-    "title": {
-      "@id": "http://id.loc.gov/ontologies/bibframe/title",
-      "@type": "@id"
-    },
-    "type": {
-      "@id": "http://purl.org/dc/terms/type",
-      "@type": "@id"
-    },
-    "value": {
-      "@id": "http://www.w3.org/1999/02/22-rdf-syntax-ns#value"
-    },
-    "wasAttributedTo": {
-      "@id": "http://www.w3.org/ns/prov#wasAttributedTo",
-      "@type": "@id"
-    }
-  },
-  "@graph": [
-    {
-      "@id": "http://id.loc.gov/vocabulary/countries/pl",
-      "label": "Poland"
-    },
-    {
-      "@id": "organization/ISSNCenter#57",
-      "@type": "http://schema.org/Organization"
-    },
-    {
-      "@id": "resource/ISSN-L/0001-4125",
-      "identifiedBy": "resource/ISSN/0001-4125#ISSN-L"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125",
-      "@type": [
-        "http://schema.org/Periodical",
-        "http://id.loc.gov/ontologies/bibframe/Instance",
-        "http://id.loc.gov/ontologies/bibframe/Work"
-      ],
-      "format": "vocabularies/medium#Print",
-      "http://schema.org/issn": "0001-4125",
-      "identifiedBy": [
-        "resource/ISSN/0001-4125#ISSN-L",
-        "resource/ISSN/0001-4125#KeyTitle",
-        "resource/ISSN/0001-4125#ISSN"
-      ],
-      "identifier": "0001-4125",
-      "isPartOf": "resource/ISSN-L/0001-4125",
-      "issn": "0001-4125",
-      "name": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques",
-      "publication": "resource/ISSN/0001-4125#ReferencePublicationEvent",
-      "title": "resource/ISSN/0001-4125#KeyTitle",
-      "type": "http://marc21rdf.info/terms/formofmaterial#a"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125#ISSN",
-      "@type": "http://id.loc.gov/ontologies/bibframe/Issn",
-      "status": "vocabularies/IdentifierStatus#Valid",
-      "value": "0001-4125"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125#ISSN-L",
-      "@type": "http://id.loc.gov/ontologies/bibframe/IssnL",
-      "status": "vocabularies/IdentifierStatus#Valid",
-      "value": "0001-4125"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125#KeyTitle",
-      "@type": [
-        "http://id.loc.gov/ontologies/bibframe/Identifier",
-        "http://id.loc.gov/ontologies/bibframe/KeyTitle"
-      ],
-      "value": "Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125#Record",
-      "@type": "http://schema.org/CreativeWork",
-      "mainEntity": "resource/ISSN/0001-4125",
-      "modified": "20051223105700.0",
-      "status": "vocabularies/RecordStatus#Register",
-      "wasAttributedTo": "organization/ISSNCenter#57"
-    },
-    {
-      "@id": "resource/ISSN/0001-4125#ReferencePublicationEvent",
-      "@type": "http://schema.org/PublicationEvent",
-      "location": "http://id.loc.gov/vocabulary/countries/pl"
-    }
-  ]
-}
-
-"""
-
-import argparse
-import collections
-import itertools
-import json
-import os
-import re
-import shelve
-import sys
-from typing import Any, Callable, Dict, Generator, Iterable, List, Tuple, Union
-
-from simhash import Simhash
-
-from fuzzycat import cleanups
-from fuzzycat.utils import SetEncoder
-
-
-def listify(v: Union[str, List[str]]) -> List[str]:
-    """
-    Sensible create a list.
-    """
-    if v is None:
-        return []
-    if isinstance(v, str):
-        return [v]
-    return v
-
-
-def jsonld_minimal(v: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Turn a JSON from issn.org into a smaller dict with a few core fields.  Will
-    fail, if no ISSN-L is found in the input.
-
-    {
-      "issnl": "0001-4125",
-      "material": [],
-      "issns": [
-	"0001-4125"
-      ],
-      "urls": [],
-      "names": [
-	"Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
-      ]
-    }
-
-    """
-    items = v.get("@graph")
-    if not items:
-        return {}
-    doc = {}
-    for item in items:
-        # "@id": "resource/ISSN-L/0001-4125"
-        # "@id": "resource/ISSN/0001-4125"
-        # ...
-        id = item.get("@id")
-        if not id:
-            continue
-
-        # ISSN-L
-        match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
-        if match:
-            doc["issnl"] = match.group(1)
-            continue
-
-        # The "main" issn entry.
-        match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", id)
-        if match:
-            # if we do not have ISSN-L yet, check "exampleOfWork",
-            # "resource/ISSN/2658-0705"
-            if not "issnl" in doc:
-                match = re.match(r"^resource/ISSN-L/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$",
-                                 item.get("exampleOfWork", ""))
-                if match:
-                    doc["issnl"] = match.group(1)
-
-            # note material
-            doc["material"] = listify(item.get("material", []))
-
-            # collect ids
-            issns = set([match.group(1)])
-            if item.get("identifier"):
-                issns.add(item.get("identifier"))
-            if item.get("issn"):
-                issns.add(item.get("issn"))
-            doc["issns"] = issns
-            # add urls
-            doc["urls"] = listify(item.get("url", []))
-            # add names, variants
-            names = listify(item.get("name")) + listify(item.get("alternateName"))
-            doc["names"] = list(set(names))
-
-            # add related issn
-            for v in listify(item.get("isFormatOf", [])):
-                match = re.match(r"^resource/ISSN/([0-9]{4,4}-[0-9]{3,3}[0-9xX])$", v)
-                if match:
-                    doc["issns"].add(match.group(1))
-
-    if "issnl" not in doc:
-        raise ValueError("entry without issnl: {}".format(item))
-
-    return doc
-
-
-def de_jsonld(lines: Iterable):
-    """
-    Batch convert jsonld to minimal JSON and write to stdout.
-    """
-    for line in lines:
-        line = line.strip()
-        try:
-            doc = jsonld_minimal(json.loads(line))
-        except json.decoder.JSONDecodeError as exc:
-            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
-            continue
-        else:
-            print(json.dumps(doc, cls=SetEncoder))
-
-
-def generate_name_pairs(lines: Iterable,
-                        cleanup_pipeline: Callable[[str], str] = None,
-                        keep_original: bool = True) -> Generator[Tuple[str, str, str], None, None]:
-    """
-    Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
-    errors. Proto unit test data.
-
-    Example output:
-
-    0013-211X       Eendracht-bode (Tholen) Eendracht-bode.
-    0012-7388       Dynamic maturity        Dynamic maturity.
-    0012-6055       Drehpunkt.      Drehpunkt (Basel. 1968)
-
-    Basically, these would be free test cases, since we would like to report
-    "match" on most of these.
-
-    That can be useful to detect various scripts refering to the same journal.
-
-    0040-2249       Tehnika kino i televideniâ.     Tehnika kino i televideniâ
-    0040-2249       Tehnika kino i televideniâ.     Техника кино и телевидения
-    0040-2249       Tehnika kino i televideniâ.     Техника кино и телевидения.
-    0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения
-    0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения.
-    0040-2249       Техника кино и телевидения      Техника кино и телевидения.
-
-    If cleanup_pipeline is given, additionally add
-    """
-    for line in lines:
-        line = line.strip()
-        try:
-            doc = jsonld_minimal(json.loads(line))
-        except json.decoder.JSONDecodeError as exc:
-            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
-            continue
-        for a, b in itertools.combinations(doc.get("names", []), 2):
-            if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
-                yield (doc["issnl"], a, b)
-            if cleanup_pipeline:
-                a = cleanup_pipeline(a)
-                b = cleanup_pipeline(b)
-                yield (doc["issnl"], a, b)
-
-
-def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline: Callable[[str], str] = None):
-    """
-    Given JSON lines, generate a dictionary mapping names sets of ISSN. Names
-    might be reused.
-    """
-    mapping = collections.defaultdict(set)
-    for issnl, a, b in generate_name_pairs(lines, cleanup_pipeline=cleanup_pipeline):
-        mapping[a].add(issnl)
-        mapping[b].add(issnl)
-    return mapping
-
-
-def generate_shelve(lines: Iterable, output: str, cleanup_pipeline: Callable[[str], str] = None):
-    """
-    Generate a persistent key value store from name issn mappings. 5015523
-    entries, 1.1G take about 5min.
-    """
-    with shelve.open(output) as db:
-        mapping = generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline)
-        for name, issnls in mapping.items():
-            db[name] = issnls
-        print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)
-
-
-def generate_simhash(lines: Iterable):
-    """
-    Print TSV with simhash values.
-
-    Match and non-match count.
-
-    1069447 1
-     927120 0
-    """
-    for issnl, a, b in generate_name_pairs(lines):
-        ha = Simhash(a).value
-        hb = Simhash(b).value
-        row = (issnl, 0 if ha == hb else 1, ha, hb)
-        print("\t".join([str(v) for v in row]))
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("file",
-                        default=sys.stdin,
-                        type=argparse.FileType("r"),
-                        help="public data from issn, one JSON object per line")
-    parser.add_argument("--make-pairs",
-                        action="store_true",
-                        help="generate TSV and write to stdout")
-    parser.add_argument("--make-mapping",
-                        action="store_true",
-                        help="generate JSON mapping from name to list of ISSN")
-    parser.add_argument("--make-shelve",
-                        action="store_true",
-                        help="generate trie mapping from name to list of ISSN")
-    parser.add_argument("--make-simhash", action="store_true", help="print out simhash value")
-    parser.add_argument("-o",
-                        "--output",
-                        type=str,
-                        default="output.file",
-                        help="write output to file")
-    parser.add_argument("-c", "--cleanup", type=str, default=None, help="cleanup pipeline name")
-    parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
-
-    args = parser.parse_args()
-
-    # Add additional cleanup routines here.
-    cleanup = dict(basic=cleanups.basic).get(args.cleanup)
-
-    if args.make_mapping:
-        print(
-            json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup),
-                       cls=SetEncoder))
-    if args.make_pairs:
-        for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
-            print("{}\t{}\t{}".format(issn, a, b))
-    if args.de_jsonld:
-        de_jsonld(args.file)
-    if args.make_shelve:
-        generate_shelve(args.file, output=args.output, cleanup_pipeline=cleanup)
-    if args.make_simhash:
-        generate_simhash(args.file)
author	Martin Czygan <martin.czygan@gmail.com>	2020-10-21 03:54:53 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-10-21 03:54:53 +0200
commit	c134c0974d0fc8b57a0d3329d389ac72120a01bb (patch)
tree	676ccc717e96dcca92e56e2a490c5c0d18240f45 /fuzzycat/issn.py
parent	2cd5ec9f9c3c91dfe79c98f7d73112b88061d383 (diff)
download	fuzzycat-c134c0974d0fc8b57a0d3329d389ac72120a01bb.tar.gz fuzzycat-c134c0974d0fc8b57a0d3329d389ac72120a01bb.zip