aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/issn.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/issn.py')
-rw-r--r--fuzzycat/issn.py12
1 files changed, 7 insertions, 5 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 46786c9..064604c 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -274,7 +274,7 @@ def de_jsonld(lines: Iterable):
print(json.dumps(doc, cls=SetEncoder))
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
"""
Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
errors. Proto unit test data.
@@ -297,7 +297,7 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
0040-2249 Tehnika kino i televideniâ Техника кино и телевидения.
0040-2249 Техника кино и телевидения Техника кино и телевидения.
- New: apply transformations on keys.
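+ If cleanup_pipeline is given, additionally yield the pair with the cleanup applied to both names; the original pair is kept unless keep_original is False.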
"""
for line in lines:
line = line.strip()
@@ -307,11 +307,12 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
for a, b in itertools.combinations(doc.get("names", []), 2):
+ if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
+ yield (doc["issnl"], a, b)
if cleanup_pipeline:
a = cleanup_pipeline(a)
b = cleanup_pipeline(b)
- yield (doc["issnl"], a, b)
-
+ yield (doc["issnl"], a, b)
def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
"""
@@ -327,7 +328,8 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
"""
- Generate a persistent key value store from name issn mappings.
+ Generate a persistent key value store from name issn mappings. For
+ 5015523 entries (about 1.1G), this takes about 5min.
"""
with shelve.open(output) as db:
for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():