aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-12-15 02:39:35 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-12-15 02:39:35 +0100
commit f939949ae3889078ef2c9d77d1cffdd939e11435 (patch)
tree eeffa30293db814523582125f1b94c6e7e3517f5 /fuzzycat/utils.py
parent b18c9c73150679a8e1ac92cd0bea7a649de0b39b (diff)
download fuzzycat-f939949ae3889078ef2c9d77d1cffdd939e11435.tar.gz
fuzzycat-f939949ae3889078ef2c9d77d1cffdd939e11435.zip
single item verification
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- fuzzycat/utils.py 34
1 files changed, 34 insertions, 0 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 682f912..cf74220 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,6 +1,8 @@
import collections
import io
import itertools
+import os
+import random
import re
import string
@@ -13,6 +15,7 @@ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
ParsedPages = collections.namedtuple("ParsedPages", "start end count")
+
def parse_page_string(s):
"""
Parse typical page strings, e.g. 150-180.
@@ -35,6 +38,7 @@ def parse_page_string(s):
count = b - a + 1
return ParsedPages(start=a, end=b, count=count)
+
def dict_key_exists(doc, path):
"""
Return true, if key in a dictionary at a given path exists. XXX: probably
@@ -138,3 +142,33 @@ def contains_chemical_formula(s):
for token in s.split():
if CHEM_FORMULA.search(token):
return True
+
+
def random_word(func=lambda w: True, wordsfile='/usr/share/dict/words'):
    """
    Return a single, random word from a words file (one word per line).

    Requires the UNIX words file in a typical location, unless another path
    is given via `wordsfile`.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored. `func` is a predicate that is called with
    each remaining word, and only words for which it returns a true value
    are candidates.

    Raises RuntimeError if `wordsfile` does not exist, and IndexError if no
    word passes the filter.
    """
    if not os.path.exists(wordsfile):
        raise RuntimeError('file not found: {}'.format(wordsfile))
    with open(wordsfile) as f:
        stripped = (line.strip() for line in f)
        # Blank lines are not words. Dropping them before calling `func`
        # means an empty string is never returned and predicates that index
        # into the word (e.g. `w[0].isupper()`) do not blow up on them.
        words = [word for word in stripped if word and func(word)]
    if not words:
        # random.choice would raise IndexError here anyway; keep the type
        # but say which file and why.
        raise IndexError('no word in {} matches the given filter'.format(wordsfile))
    return random.choice(words)
+
+
def random_idents_from_query(query="*",
                             es="https://search.fatcat.wiki/fatcat_release/_search",
                             max_retries=10,
                             r=2):
    """
    Return a number of random idents from a search query.

    Sends `query` as the `q` parameter to the search endpoint `es`, collects
    the `_source.ident` value of every hit in the response and returns a
    list of `r` of them, chosen at random without replacement.

    If a response contains fewer than `r` hits, the request is repeated, up
    to `max_retries` times in total; if no attempt yields enough hits, None
    is returned.

    Raises RuntimeError if the endpoint answers with a non-200 status.
    """
    # NOTE(review): relies on a module-level `import requests`, which is not
    # among the imports visible in this excerpt -- confirm it exists.
    for _ in range(max_retries):
        # Do not reuse the name `r` for the response: `r` is the sample
        # size, and shadowing it handed a Response object to random.sample.
        resp = requests.get(es, params={"q": query})
        if resp.status_code != 200:
            raise RuntimeError('could not query {} for random item: {}'.format(es, resp.url))
        payload = resp.json()
        idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
        # random.sample can only draw from the hits actually returned, so
        # compare their count (not the reported total) against the
        # requested sample size.
        if len(idents) < r:
            continue
        return random.sample(idents, r)
    return None