diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:39:33 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:39:33 +0200 | 
| commit | c8763acfb2504951f0173c2cd249263f8ebf13ae (patch) | |
| tree | 90c8f4cdb6389e2ac1ead036f052cde0749f35f9 /fuzzycat | |
| parent | 9b416db2393988ae5bf097f754e885848ee31636 (diff) | |
| download | fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.tar.gz fuzzycat-c8763acfb2504951f0173c2cd249263f8ebf13ae.zip  | |
include original and normalized name in default shelve (1G)
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/issn.py | 12 | ||||
| -rw-r--r-- | fuzzycat/journals.py | 9 | 
2 files changed, 13 insertions, 8 deletions
```diff
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 46786c9..064604c 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -274,7 +274,7 @@ def de_jsonld(lines: Iterable):
             print(json.dumps(doc, cls=SetEncoder))
 
 
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None, keep_original=True):
     """
     Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
     errors. Proto unit test data.
@@ -297,7 +297,7 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
     0040-2249       Tehnika kino i televideniâ      Техника кино и телевидения.
     0040-2249       Техника кино и телевидения      Техника кино и телевидения.
 
-    New: apply transformations on keys.
+    If cleanup_pipeline is given, additionally add
     """
     for line in lines:
         line = line.strip()
@@ -307,11 +307,12 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
             print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
             continue
         for a, b in itertools.combinations(doc.get("names", []), 2):
+            if cleanup_pipeline is None or (cleanup_pipeline is not None and keep_original):
+                yield (doc["issnl"], a, b)
             if cleanup_pipeline:
                 a = cleanup_pipeline(a)
                 b = cleanup_pipeline(b)
-            yield (doc["issnl"], a, b)
-
+                yield (doc["issnl"], a, b)
 
 def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
     """
@@ -327,7 +328,8 @@ def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
 
 def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
     """
-    Generate a persistent key value store from name issn mappings.
+    Generate a persistent key value store from name issn mappings. 5015523
+    entries, 1.1G take about 5min.
     """
     with shelve.open(output) as db:
         for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
index 8c61f3a..bd76b7f 100644
--- a/fuzzycat/journals.py
+++ b/fuzzycat/journals.py
@@ -15,7 +15,10 @@ class JournalLookup:
         {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
     """
 
-    def __init__(self, namedb='namedb'):
+    def __init__(self, namedb='names'):
+        """
+        Note that shelve appends "db" to the name automatically.
+        """
         self.db = shelve.open(namedb)
 
     def __getitem__(self, v):
@@ -23,8 +26,8 @@ class JournalLookup:
 
     def get(self, v, cleanup_pipeline=None):
         if not cleanup_pipeline:
-            return self.db[v]
-        return self.db[cleanup_pipeline(v)]
+            return self.db.get(v)
+        return self.db.get(cleanup_pipeline(v))
 
     def close(self):
         self.db.close()
```
