include original and normalized name in default shelve (1G)

author: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:39:33 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:39:33 +0200
commit: c8763acfb2504951f0173c2cd249263f8ebf13ae (patch)
tree: 90c8f4cdb6389e2ac1ead036f052cde0749f35f9
parent: 9b416db2393988ae5bf097f754e885848ee31636 (diff)
download: fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.tar.gz
fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.zip
3 files changed, 16 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index f498609..d9cfa05 100644
--- a/Makefile
+++ b/Makefile
@@ -62,3 +62,6 @@ data/container_export.json.gz: ## Download container export
 
 data/name_to_issn.json: data/issn.ndj ## Create a name to ISSN mapping (needs an ISSN JSON dump)
 	fuzzycat-issn --make-mapping $^ > $@
+
+names.db: data/issn.ndj
+	fuzzycat-issn --make-shelve -c basic -o names $^
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 46786c9..064604c 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -274,7 +274,7 @@ def de_jsonld(lines: Iterable):
             print(json.dumps(doc, cls=SetEncoder))
 
 
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors. Proto unit test data.
@@ -297,7 +297,7 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
     0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения.
     0040-2249       Техника кино и телевидения      Техника кино и телевидения.
 
-    New: apply transformations on keys.
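+    If cleanup_pipeline is given, additionally yield the pair with cleanup_pipeline applied to both names (only that pair is yielded if keep_original is False).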
     """
     for line in lines:
         line = line.strip()
@@ -307,11 +307,12 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
         for a, b in itertools.combinations(doc.get("names", []), 2):
+            if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
+                yield (doc["issnl"], a, b)
             if cleanup_pipeline:
                 a = cleanup_pipeline(a)
                 b = cleanup_pipeline(b)
-            yield (doc["issnl"], a, b)
-
+                yield (doc["issnl"], a, b)
 
 def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
     """
@@ -327,7 +328,8 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
 
 def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
     """
-    Generate a persistent key value store from name issn mappings.
+    Generate a persistent key value store from name issn mappings. For 5015523
+    entries, the store is about 1.1G and takes about 5min to build.
     """
     with shelve.open(output) as db:
         for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
index 8c61f3a..bd76b7f 100644
--- a/fuzzycat/journals.py
+++ b/fuzzycat/journals.py
@@ -15,7 +15,10 @@ class JournalLookup:
         {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
 
     """
-    def __init__(self, namedb='namedb'):
+    def __init__(self, namedb='names'):
+        """
+        Note that shelve may append a ".db" suffix to the name automatically (this depends on the dbm backend).
+        """
         self.db = shelve.open(namedb)
 
     def __getitem__(self, v):
@@ -23,8 +26,8 @@ class JournalLookup:
 
     def get(self, v, cleanup_pipeline=None):
         if not cleanup_pipeline:
-            return self.db[v]
-        return self.db[cleanup_pipeline(v)]
+            return self.db.get(v)
+        return self.db.get(cleanup_pipeline(v))
 
     def close(self):
         self.db.close()
author	Martin Czygan <martin.czygan@gmail.com>	2020-08-15 18:39:33 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-08-15 18:39:33 +0200
commit	c8763acfb2504951f0173c2cd249263f8ebf13ae (patch)
tree	90c8f4cdb6389e2ac1ead036f052cde0749f35f9
parent	9b416db2393988ae5bf097f754e885848ee31636 (diff)
download	fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.tar.gz fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.zip