aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-08-15 18:18:25 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-08-15 18:18:25 +0200
commit9b416db2393988ae5bf097f754e885848ee31636 (patch)
tree11f4cbccd131d2d8f8c8b71fcfc3b93fb871181e
parent2f948cfbb484241178fa7e8c7abd8b0c40a9db24 (diff)
downloadfuzzycat-9b416db2393988ae5bf097f754e885848ee31636.tar.gz
fuzzycat-9b416db2393988ae5bf097f754e885848ee31636.zip
separate cleanups
-rw-r--r--fuzzycat/cleanups.py17
-rw-r--r--fuzzycat/journals.py30
2 files changed, 47 insertions, 0 deletions
diff --git a/fuzzycat/cleanups.py b/fuzzycat/cleanups.py
new file mode 100644
index 0000000..d806e51
--- /dev/null
+++ b/fuzzycat/cleanups.py
@@ -0,0 +1,17 @@
+
+"""
+Various shared cleanup approaches.
+"""
+
+from fuzzycat.utils import StringPipeline, normalize_whitespace, normalize_ampersand
+
+
+# These transformations should not affect the name of a journal.
+basic = StringPipeline([
+ str.lower,
+ normalize_whitespace,
+ normalize_ampersand,
+ lambda v: v.rstrip("."),
+])
+
+
diff --git a/fuzzycat/journals.py b/fuzzycat/journals.py
new file mode 100644
index 0000000..8c61f3a
--- /dev/null
+++ b/fuzzycat/journals.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+
+"""
+Journal name matching. Includes names from the ISSN database and abbreviations.
+"""
+
+import shelve
+
+class JournalLookup:
+ """
+ Lookup allows looking up journals, using a database of real journal names.
+
+ >>> lookup = JournalLookup()
+ >>> lookup["Philosophica"]
+ {'1857-9272', '2232-299X', '2232-3007', '2232-3015'}
+
+ """
+ def __init__(self, namedb='namedb'):
+ self.db = shelve.open(namedb)
+
+ def __getitem__(self, v):
+ return self.db[v]
+
+ def get(self, v, cleanup_pipeline=None):
+ if not cleanup_pipeline:
+ return self.db[v]
+ return self.db[cleanup_pipeline(v)]
+
+ def close(self):
+ self.db.close()