aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
blob: 303daf6aa012d0269b441fe4aee438c7a3fd68b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import collections
import itertools
import os
import random
import re
import string
import subprocess
import tempfile
from typing import Optional

import requests
from glom import PathAccessError, glom
from zstandard import ZstdDecompressor

# ASCII letters, digits and whitespace -- i.e. printable characters minus
# punctuation; this is the character set slugify_string keeps.
printable_no_punct = string.digits + string.ascii_letters + string.whitespace

# Loose heuristic for chemical formula tokens like C3H8O.
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")

# Result type of parse_page_string: start page, end page (or None), page count (or None).
ParsedPages = collections.namedtuple("ParsedPages", "start end count")


def es_compat_hits_total(resp):
    """
    Given a search response dict, support ES6 and ES7 style total value. See:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html

    It is responsibility of the call site to set `track_total_hits` in ES7 to
    get an exact number.
    """
    total = resp["hits"]["total"]
    # ES7 nests the count under "value"; ES6 reports a bare number.
    if isinstance(total, dict):
        return total["value"]
    return total


def parse_page_string(s):
    """
    Parse typical page strings, e.g. 150-180 or p123.

    If only a single page number is found, returns that first page and None for
    end page and count. If two are found, and they are consistent as a range,
    returns the start, end, and count.

    Does not handle lists of page numbers, roman numerals, and several other
    patterns.
    """
    if not s:
        raise ValueError('page parsing: empty string')
    # Drop a leading page marker like "p123" or electronic page "e123".
    if s[0].lower() in ('p', 'e'):
        s = s[1:]
    if s.isnumeric():
        return ParsedPages(start=int(s), end=None, count=None)
    match = re.match("([0-9]{1,})-([0-9]{1,})", s)
    if match is None:
        raise ValueError('cannot parse page pattern from {}'.format(s))
    start, end = match.group(1), match.group(2)
    # Abbreviated end pages occur in the wild, e.g. '261-5' or '577-89':
    # borrow the missing leading digits from the start page.
    if len(end) == 1 and start and start[-1] < end:
        end = start[:-1] + end
    elif len(end) == 2 and start and start[-2:] < end:
        end = start[:-2] + end
    first, last = int(start), int(end)
    if first > last:
        raise ValueError('invalid page range: {}'.format(s))
    return ParsedPages(start=first, end=last, count=last - first + 1)


def dict_key_exists(doc, path):
    """
    Return True, if a key in a dictionary at a given glom path exists,
    False otherwise. XXX: probably already in glom.
    """
    try:
        glom(doc, path)
        return True
    except PathAccessError:
        return False


def clean_doi(raw: Optional[str]) -> Optional[str]:
    """
    Normalize a DOI-ish string: lowercase, strip a "doi:" prefix and any
    leading URL parts, and collapse an accidental double slash right after
    the registrant prefix. Returns None when no "10." marker is present.
    """
    if not raw:
        return None
    candidate = raw.strip().lower()
    if candidate.startswith("doi:"):
        candidate = candidate[len("doi:"):]
    index = candidate.find("10.")
    if index == -1:
        return None
    if index > 0:
        # Cut away URL or other prefix before the DOI proper.
        candidate = candidate[index:]
    if candidate[7:9] == "//":
        # e.g. "10.1234//abc" -> "10.1234/abc"
        candidate = candidate[:8] + candidate[9:]
    return candidate


def doi_prefix(v):
    """
    Return the prefix of a DOI, i.e. everything before the first slash.
    """
    prefix, _, _ = v.partition("/")
    return prefix


def has_doi_prefix(v, prefix="10.1234"):
    """
    Returns False, if we cannot parse v or prefix does not match.
    """
    if not v:
        return False
    head, _, _ = v.partition("/")
    return head == prefix


def slugify_string(s: str) -> str:
    """
    Lowercase, keep only ASCII letters, digits and whitespace (the same set
    as the module-level ``printable_no_punct``), and collapse runs of
    whitespace to single spaces.
    """
    allowed = string.digits + string.ascii_letters + string.whitespace
    kept = [c for c in s.lower() if c in allowed]
    return ' '.join(''.join(kept).split())


def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
    """
    Return a callable that extracts column `f` (zero-based) from a line.

    The line is stripped, then split on `sep`. When the column does not
    exist, the callable returns "" if `ignore_missing_column` is true and
    raises ValueError otherwise.
    """
    def extract(value):
        columns = value.strip().split(sep)
        if f < len(columns):
            return columns[f]
        if ignore_missing_column:
            return ""
        raise ValueError('cannot split value {} into {} parts'.format(value, f))

    return extract


def author_similarity_score(u, v):
    """
    Given two author strings, return a similarity score between 0 and 1,
    computed as the Jaccard overlap of their token n-gram sets.
    """
    grams_u = set(token_n_grams(u))
    grams_v = set(token_n_grams(v))
    return jaccard(grams_u, grams_v)


def jaccard(a, b):
    """
    Jaccard index of sets a and b; 0 for two empty sets.
    """
    union = a | b
    if not union:
        return 0
    return len(a & b) / len(union)


def token_n_grams(s, n=2):
    """
    Return n-grams as strings, calculated per token of s.
    """
    return [
        "".join(group)
        for token in tokenize_string(s)
        for group in nwise(token, n=n)
    ]


def tokenize_string(s):
    """
    Normalize (lowercase) and tokenize on whitespace; should be broken up.
    """
    # The previous list comprehension only copied the result of split().
    return s.lower().split()


def nwise(iterable, n=2):
    """
    Generalized pairwise: yield non-overlapping tuples of up to `n`
    consecutive items from `iterable`; the final tuple may be shorter.
    """
    it = iter(iterable)
    # iter(callable, sentinel) stops as soon as islice yields nothing.
    return iter(lambda: tuple(itertools.islice(it, n)), ())


def num_project(s):
    """
    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u

    Replace every run of digits in s with the literal token '<NUM>'.
    """
    digits = re.compile(r'\d+')
    return digits.sub('<NUM>', s)


def contains_chemical_formula(s):
    """
    Return True if any whitespace-separated token in s looks like a chemical
    formula (e.g. C3H8O), else False.
    """
    # Previously fell off the end and returned None implicitly; an explicit
    # bool matches the docstring (same truthiness for existing callers).
    return any(CHEM_FORMULA.search(token) for token in s.split())


def random_word(func=lambda w: True, wordsfile='/usr/share/dict/words'):
    """
    Return a single random word from `wordsfile` (one word per line),
    optionally filtered by `func`. Requires a UNIX words file in a typical
    location by default; raises RuntimeError if the file does not exist.
    """
    if not os.path.exists(wordsfile):
        raise RuntimeError('file not found: {}'.format(wordsfile))
    with open(wordsfile) as handle:
        candidates = [w for w in (line.strip() for line in handle) if func(w)]
    return random.choice(candidates)


def random_idents_from_query(query="*",
                             es="https://search.fatcat.wiki/fatcat_release/_search",
                             r=2):
    """
    Return `r` random idents from the hits of an elasticsearch search query.

    Raises RuntimeError when the request fails or fewer than two documents
    match.
    """
    resp = requests.get(es, params={"q": query})
    if resp.status_code != 200:
        # Bugfix: the original referenced `r.url`, but `r` is the sample size
        # (an int) -- the requested URL lives on the response object.
        raise RuntimeError('could not query {} for random item: {}'.format(es, resp.url))
    payload = resp.json()
    if es_compat_hits_total(payload) < 2:
        raise RuntimeError('too few documents')
    idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
    return random.sample(idents, r)


def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    >>> for line in zstdlines("file.zst"):
    ...     print(line)

    """
    with open(filename, "rb") as f:
        decomp = ZstdDecompressor()
        with decomp.stream_reader(f) as reader:
            prev_line = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                while True:
                    # We start with bytes but want unicode, and a chunk
                    # boundary may split a multi-byte codepoint; extend the
                    # chunk one byte at a time until it decodes.
                    try:
                        string_data = chunk.decode(encoding)
                    except UnicodeDecodeError:
                        chunk = chunk + reader.read(1)
                    else:
                        break
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = prev_line + line
                    yield line
                prev_line = lines[-1]
            # Bugfix: the carry-over was previously dropped, losing the final
            # line of files that do not end with a newline.
            if prev_line:
                yield prev_line


def shellout(template,
             preserve_whitespace=False,
             executable='/bin/bash',
             ignoremap=None,
             encoding=None,
             pipefail=True,
             **kwargs):
    """
    Takes a shell command template and executes it. The template must use the
    new (2.6+) format mini language. `kwargs` must contain any defined
    placeholder, only `output` is optional and will be autofilled with a
    temporary file if it used, but not specified explicitly.

    If `pipefail` is `False` no subshell environment will be spawned, where a
    failed pipe will cause an error as well. If `preserve_whitespace` is `True`,
    no whitespace normalization is performed. A custom shell executable name can
    be passed in `executable` and defaults to `/bin/bash`.

    Raises RuntimeError on nonzero exit codes (the exception carries the exit
    status in its `code` attribute). To ignore certain errors, pass a
    dictionary in `ignoremap`, with the error code to ignore as key and a
    string message as value.

    Simple template:

        wc -l < {input} > {output}

    Quoted curly braces:

        ps ax|awk '{{print $1}}' > {output}

    """
    if 'output' not in kwargs:
        # Autofill a temporary output file; it is created and left on disk
        # for the caller.
        kwargs.update({'output': tempfile.mkstemp(prefix='gluish-')[1]})
    if ignoremap is None:
        ignoremap = {}
    if encoding and isinstance(template, bytes):
        # Bugfix: Python 3 `str` has no decode(); only decode when the
        # template actually is bytes, a str template passes through as-is.
        template = template.decode(encoding)
    command = template.format(**kwargs)
    if not preserve_whitespace:
        command = re.sub('[ \t\n]+', ' ', command)
    if pipefail:
        command = '(set -o pipefail && %s)' % command
    code = subprocess.call([command], shell=True, executable=executable)
    if code != 0 and code not in ignoremap:
        error = RuntimeError('%s exitcode: %s' % (command, code))
        error.code = code
        raise error
    return kwargs.get('output')