diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:39:33 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:39:33 +0200 |
commit | c8763acfb2504951f0173c2cd249263f8ebf13ae (patch) | |
tree | 90c8f4cdb6389e2ac1ead036f052cde0749f35f9 | |
parent | 9b416db2393988ae5bf097f754e885848ee31636 (diff) | |
download | fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.tar.gz fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.zip |
include original and normalized name in default shelve (1G)
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | fuzzycat/issn.py | 12 | ||||
-rw-r--r-- | fuzzycat/journals.py | 9 |
3 files changed, 16 insertions, 8 deletions
```diff
@@ -62,3 +62,6 @@ data/container_export.json.gz: ## Download container export
 
 data/name_to_issn.json: data/issn.ndj ## Create a name to ISSN mapping (needs an ISSN JSON dump)
 	fuzzycat-issn --make-mapping $^ > $@
+
+names.db: data/issn.ndj
+	fuzzycat-issn --make-shelve -c basic -o names $^
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 46786c9..064604c 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -274,7 +274,7 @@ def de_jsonld(lines: Iterable):
         print(json.dumps(doc, cls=SetEncoder))
 
 
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors. Proto unit test data.
@@ -297,7 +297,7 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
         0040-2249 Tehnika kino i televideniâ Техника кино и телевидения.
         0040-2249 Техника кино и телевидения Техника кино и телевидения.
 
-    New: apply transformations on keys.
+    If cleanup_pipeline is given, additionally add
     """
     for line in lines:
         line = line.strip()
@@ -307,11 +307,12 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
         for a, b in itertools.combinations(doc.get("names", []), 2):
+            if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
+                yield (doc["issnl"], a, b)
             if cleanup_pipeline:
                 a = cleanup_pipeline(a)
                 b = cleanup_pipeline(b)
-            yield (doc["issnl"], a, b)
-
+                yield (doc["issnl"], a, b)
 
 def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
     """
@@ -327,7 +328,8 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
 
 def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
     """
-    Generate a persistent key value store from name issn mappings.
+    Generate a persistent key value store from name issn mappings. 5015523
+    entries, 1.1G take about 5min.
     """
     with shelve.open(output) as db:
         for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
index 8c61f3a..bd76b7f 100644
--- a/fuzzycat/journals.py
+++ b/fuzzycat/journals.py
@@ -15,7 +15,10 @@ class JournalLookup:
         {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
     """
 
-    def __init__(self, namedb='namedb'):
+    def __init__(self, namedb='names'):
+        """
+        Note that shelve appends "db" to the name automatically.
+        """
         self.db = shelve.open(namedb)
 
     def __getitem__(self, v):
@@ -23,8 +26,8 @@ class JournalLookup:
 
     def get(self, v, cleanup_pipeline=None):
         if not cleanup_pipeline:
-            return self.db[v]
-        return self.db[cleanup_pipeline(v)]
+            return self.db.get(v)
+        return self.db.get(cleanup_pipeline(v))
 
     def close(self):
         self.db.close()
```