diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 02:39:35 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-15 02:39:35 +0100 |
commit | f939949ae3889078ef2c9d77d1cffdd939e11435 (patch) | |
tree | eeffa30293db814523582125f1b94c6e7e3517f5 /fuzzycat/utils.py | |
parent | b18c9c73150679a8e1ac92cd0bea7a649de0b39b (diff) | |
download | fuzzycat-f939949ae3889078ef2c9d77d1cffdd939e11435.tar.gz fuzzycat-f939949ae3889078ef2c9d77d1cffdd939e11435.zip |
single item verification
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- | fuzzycat/utils.py | 34 |
1 files changed, 34 insertions, 0 deletions
# Helpers added to fuzzycat/utils.py by this change. The same change also adds
# the `os` and `random` imports at the top of the module; they are repeated
# here so the two functions below are readable on their own.
import os
import random


def random_word(func=lambda w: True, wordsfile='/usr/share/dict/words'):
    """
    Return a single random word from a UNIX-style words file.

    Each line of `wordsfile` is stripped of surrounding whitespace and kept
    only if `func(word)` is truthy; one of the remaining words is picked with
    `random.choice`.

    Raises RuntimeError if `wordsfile` does not exist. If no word passes
    `func`, `random.choice` raises IndexError.
    """
    if not os.path.exists(wordsfile):
        raise RuntimeError('file not found: {}'.format(wordsfile))
    with open(wordsfile) as f:
        candidates = [word for word in (line.strip() for line in f) if func(word)]
    return random.choice(candidates)


def random_idents_from_query(query="*",
                             es="https://search.fatcat.wiki/fatcat_release/_search",
                             max_retries=10,
                             r=2):
    """
    Return `r` random idents from a search query.

    The search endpoint `es` is queried with `q=query` up to `max_retries`
    times. As soon as a response reports at least `r` total hits and carries
    at least `r` documents, `r` distinct idents are sampled from that page and
    returned as a list. Returns None if no attempt yields enough documents.

    Raises RuntimeError if the endpoint answers with a non-200 status code.

    NOTE: relies on `requests` being imported at module level (the import is
    not part of the visible hunk).
    """
    for _ in range(max_retries):
        # Keep the HTTP response in its own name; reusing `r` would clobber
        # the requested sample size and break random.sample below.
        response = requests.get(es, params={"q": query})
        if response.status_code != 200:
            raise RuntimeError('could not query {} for random item: {}'.format(es, response.url))
        hits = response.json()["hits"]
        total = hits["total"]
        # Elasticsearch 7+ reports the total as {"value": n, "relation": ...},
        # older versions as a plain integer; accept both.
        if isinstance(total, dict):
            total = total.get("value", 0)
        if total < r:
            continue
        idents = [doc["_source"]["ident"] for doc in hits["hits"]]
        # The returned page may be shorter than the reported total; sampling
        # more items than available would raise ValueError.
        if len(idents) < r:
            continue
        return random.sample(idents, r)
    return None