aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-08-12 15:02:36 +0200
committerMartin Czygan <martin.czygan@gmail.com>2020-08-12 15:02:36 +0200
commitcf9e283239794247a849b1ad788fa49e664db96e (patch)
treeea0e39e8112f394cfda0ca8ab79f3df77d4a55e3 /fuzzycat/utils.py
parent5a307829670888fedd696e6220c84feed1fe6b64 (diff)
downloadfuzzycat-cf9e283239794247a849b1ad788fa49e664db96e.tar.gz
fuzzycat-cf9e283239794247a849b1ad788fa49e664db96e.zip
issn: generate a name to issn mapping
This allows us to make suggestions about potentially ambiguous titles, and perhaps to suggest a minimal title length. Ultimately, there are only about 2M journal titles. If an arbitrary string must match a journal title (not a generic container title), then we can use a combination of direct lookup plus some extra processing based on this dataset.
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r--fuzzycat/utils.py16
1 files changed, 16 insertions, 0 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 3a4be99..97125ce 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -2,6 +2,7 @@
import collections
import itertools
+import json
import re
import string
from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence
@@ -10,6 +11,21 @@ A couple of utilities, may be split up into separate modules.
"""
class SetEncoder(json.JSONEncoder):
    """
    JSON encoder that serializes python sets (and frozensets) as JSON lists.

    The stock encoder raises TypeError for set-like objects; pass this class
    via the ``cls`` argument to get a list instead:

        json.dumps({"things": {1, 2, 3}}, cls=SetEncoder)

    Element order in the emitted list follows the set's iteration order.
    """
    def default(self, obj):
        """
        Return a JSON-serializable stand-in for ``obj``.

        Sets and frozensets are converted to lists; every other type is
        delegated to the base implementation, which raises TypeError for
        objects it cannot serialize.
        """
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        return super().default(obj)
+
+
class StringPipeline:
"""
Minimalistic grouping of functions applied on an input string to produce