about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:39:33 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:39:33 +0200
commit c8763acfb2504951f0173c2cd249263f8ebf13ae (patch)
tree 90c8f4cdb6389e2ac1ead036f052cde0749f35f9
parent 9b416db2393988ae5bf097f754e885848ee31636 (diff)
download fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.tar.gz
fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.zip
include original and normalized name in default shelve (1G)
-rw-r--r-- Makefile 3
-rw-r--r-- fuzzycat/issn.py 12
-rw-r--r-- fuzzycat/journals.py 9
3 files changed, 16 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index f498609..d9cfa05 100644
--- a/Makefile
+++ b/Makefile
@@ -62,3 +62,6 @@ data/container_export.json.gz: ## Download container export
data/name_to_issn.json: data/issn.ndj ## Create a name to ISSN mapping (needs an ISSN JSON dump)
fuzzycat-issn --make-mapping $^ > $@
+
+names.db: data/issn.ndj
+ fuzzycat-issn --make-shelve -c basic -o names $^
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 46786c9..064604c 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -274,7 +274,7 @@ def de_jsonld(lines: Iterable):
print(json.dumps(doc, cls=SetEncoder))
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
"""
Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
errors. Proto unit test data.
@@ -297,7 +297,7 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
0040-2249 Tehnika kino i televideniâ Техника кино и телевидения.
0040-2249 Техника кино и телевидения Техника кино и телевидения.
- New: apply transformations on keys.
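+ If cleanup_pipeline is given, additionally yield the pair with cleanup_pipeline applied to both names (the original pair is kept if keep_original is set).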
"""
for line in lines:
line = line.strip()
@@ -307,11 +307,12 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
for a, b in itertools.combinations(doc.get("names", []), 2):
+ if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
+ yield (doc["issnl"], a, b)
if cleanup_pipeline:
a = cleanup_pipeline(a)
b = cleanup_pipeline(b)
- yield (doc["issnl"], a, b)
-
+ yield (doc["issnl"], a, b)
def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
"""
@@ -327,7 +328,8 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
"""
- Generate a persistent key value store from name issn mappings.
+ Generate a persistent key value store from name issn mappings. With
+ 5015523 entries (1.1G), this takes about 5min.
"""
with shelve.open(output) as db:
for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
index 8c61f3a..bd76b7f 100644
--- a/fuzzycat/journals.py
+++ b/fuzzycat/journals.py
@@ -15,7 +15,10 @@ class JournalLookup:
{'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
"""
- def __init__(self, namedb='namedb'):
+ def __init__(self, namedb='names'):
+ """
+ Note that shelve may append an extension such as ".db" to the name automatically, depending on the underlying dbm backend.
+ """
self.db = shelve.open(namedb)
def __getitem__(self, v):
@@ -23,8 +26,8 @@ class JournalLookup:
def get(self, v, cleanup_pipeline=None):
if not cleanup_pipeline:
- return self.db[v]
- return self.db[cleanup_pipeline(v)]
+ return self.db.get(v)
+ return self.db.get(cleanup_pipeline(v))
def close(self):
self.db.close()