try out shelve for name lookups

uncompressed about 500 MB; marisa-trie would need extra encoding approach (plus it is a heavy dependency).
author: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 17:37:13 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 17:37:13 +0200
commit: a4880d051a5b89a58b55da8f85ac2a5acb612a16 (patch)
tree: 3387a0be1d6b7408ee92a1aed89b0cf0854102a3
parent: 842a1ce9da4c48556986fad78dc40673421393f6 (diff)
download: fuzzycat-a4880d051a5b89a58b55da8f85ac2a5acb612a16.tar.gz
fuzzycat-a4880d051a5b89a58b55da8f85ac2a5acb612a16.zip
1 files changed, 62 insertions, 10 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index d95fa9a..28d2812 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -163,10 +163,11 @@ import itertools
 import json
 import os
 import re
+import shelve
 import sys
 from typing import Dict, Iterable, List, Union
 
-from fuzzycat.utils import SetEncoder
+from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace)
 
 
 def listify(v: Union[str, List[str]]) -> List[str]:
@@ -182,10 +183,21 @@ def listify(v: Union[str, List[str]]) -> List[str]:
 
 def jsonld_minimal(v: Dict) -> Dict:
     """
-    Turn a JSON from issn.org into a smaller dict with a few core fields.
+    Turn a JSON from issn.org into a smaller dict with a few core fields.  Will
+    fail, if no ISSN-L is found in the input.
+
+    {
+      "issnl": "0001-4125",
+      "material": [],
+      "issns": [
+	"0001-4125"
+      ],
+      "urls": [],
+      "names": [
+	"Bulletin de l'Académie Polonaise des Sciences. Série des Sciences Techniques"
+      ]
+    }
 
-    Example result: {'issnl': '0008-2554', 'issns': {'0008-2554'}, 'names':
-        ['Canada agriculture (Ottawa)', 'Canada agriculture.']}
     """
     items = v.get("@graph")
     if not items:
@@ -247,7 +259,7 @@ def jsonld_minimal(v: Dict) -> Dict:
 
 def de_jsonld(lines: Iterable):
     """
-    Convert to a minimal JSON format.
+    Batch convert to minimal JSON.
     """
     for line in lines:
         line = line.strip()
@@ -260,10 +272,19 @@ def de_jsonld(lines: Iterable):
             print(json.dumps(doc, cls=SetEncoder))
 
 
-def generate_name_pairs(lines: Iterable):
+# These transformations should not affect the name or a journal.
+cleanup_pipeline = StringPipeline([
+    str.lower,
+    normalize_whitespace,
+    normalize_ampersand,
+    lambda v: v.rstrip("."),
+])
+
+
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline):
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
-    errors.
+    errors. Proto unit test data.
 
     Example output:
 
@@ -271,8 +292,19 @@ def generate_name_pairs(lines: Iterable):
     0012-7388       Dynamic maturity        Dynamic maturity.
     0012-6055       Drehpunkt.      Drehpunkt (Basel. 1968)
 
-    Basically, these would be free test cases, since we would like to report "match" on most of these.
+    Basically, these would be free test cases, since we would like to report
+    "match" on most of these.
+
+    That can be useful to detect various scripts refering to the same journal.
+
+    0040-2249       Tehnika kino i televideniâ.     Tehnika kino i televideniâ
+    0040-2249       Tehnika kino i televideniâ.     Техника кино и телевидения
+    0040-2249       Tehnika kino i televideniâ.     Техника кино и телевидения.
+    0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения
+    0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения.
+    0040-2249       Техника кино и телевидения      Техника кино и телевидения.
 
+    New: apply transformations on keys.
     """
     for line in lines:
         line = line.strip()
@@ -282,6 +314,8 @@ def generate_name_pairs(lines: Iterable):
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
         for a, b in itertools.combinations(doc.get("names", []), 2):
+            a = cleanup_pipeline.run(a)
+            b = cleanup_pipeline.run(b)
             yield (doc["issnl"], a, b)
 
 
@@ -297,6 +331,16 @@ def generate_name_issn_mapping(lines: Iterable):
     return mapping
 
 
+def generate_shelve(lines: Iterable, output: str):
+    """
+    Generate a persistent key value store from name issn mappings.
+    """
+    with shelve.open(output) as db:
+        for name, issnls in generate_name_issn_mapping(lines).items():
+            db[name] = issnls
+        print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("file",
@@ -309,16 +353,24 @@ def main():
     parser.add_argument("--make-mapping",
                         action="store_true",
                         help="generate JSON mapping from name to list of ISSN")
+    parser.add_argument("--make-shelve",
+                        action="store_true",
+                        help="generate trie mapping from name to list of ISSN")
+    parser.add_argument("-o",
+                        "--output",
+                        type=str,
+                        default="output.file",
+                        help="write output to file")
     parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
 
     args = parser.parse_args()
 
     if args.make_mapping:
         print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder))
-
     if args.make_pairs:
         for issn, a, b in generate_name_pairs(args.file):
             print("{}\t{}\t{}".format(issn, a, b))
-
     if args.de_jsonld:
         de_jsonld(args.file)
+    if args.make_shelve:
+        generate_shelve(args.file, output=args.output)
author	Martin Czygan <martin.czygan@gmail.com>	2020-08-15 17:37:13 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-08-15 17:37:13 +0200
commit	a4880d051a5b89a58b55da8f85ac2a5acb612a16 (patch)
tree	3387a0be1d6b7408ee92a1aed89b0cf0854102a3
parent	842a1ce9da4c48556986fad78dc40673421393f6 (diff)
download	fuzzycat-a4880d051a5b89a58b55da8f85ac2a5acb612a16.tar.gz fuzzycat-a4880d051a5b89a58b55da8f85ac2a5acb612a16.zip