diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:18:25 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:18:25 +0200 |
commit | 9b416db2393988ae5bf097f754e885848ee31636 (patch) | |
tree | 11f4cbccd131d2d8f8c8b71fcfc3b93fb871181e | |
parent | 2f948cfbb484241178fa7e8c7abd8b0c40a9db24 (diff) | |
download | fuzzycat-9b416db2393988ae5bf097f754e885848ee31636.tar.gz fuzzycat-9b416db2393988ae5bf097f754e885848ee31636.zip |
separate cleanups
-rw-r--r-- | fuzzycat/cleanups.py | 17 | ||||
-rw-r--r-- | fuzzycat/journals.py | 30 |
2 files changed, 47 insertions, 0 deletions
diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
new file mode 100644
index 0000000..d806e51
--- /dev/null
+++ b/fuzzycat/cleanups.py
@@ -0,0 +1,17 @@
+
+"""
+Various shared cleanup approaches.
+"""
+
+from fuzzycat.utils import StringPipeline, normalize_whitespace, normalize_ampersand
+
+
+# These transformations should not affect the name or a journal.
+basic = StringPipeline([
+    str.lower,
+    normalize_whitespace,
+    normalize_ampersand,
+    lambda v: v.rstrip("."),
+])
+
+
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
new file mode 100644
index 0000000..8c61f3a
--- /dev/null
+++ b/fuzzycat/journals.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+
+"""
+Journal name matching. Includes names from issn database and abbreviations.
+"""
+
+import shelve
+
+class JournalLookup:
+    """
+    Lookup allows to lookup journals, using a database of real journal names.
+
+    >>> lookup = JournalLookup()
+    >>> lookup["Philosophica"]
+    {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
+
+    """
+    def __init__(self, namedb='namedb'):
+        self.db = shelve.open(namedb)
+
+    def __getitem__(self, v):
+        return self.db[v]
+
+    def get(self, v, cleanup_pipeline=None):
+        if not cleanup_pipeline:
+            return self.db[v]
+        return self.db[cleanup_pipeline(v)]
+
+    def close(self):
+        self.db.close()