1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
|
import collections
import itertools
import os
import random
import re
import string
import subprocess
import tempfile
from typing import Optional
import requests
from glom import PathAccessError, glom
from zstandard import ZstdDecompressor
# Characters kept by slugify_string: digits, ASCII letters and whitespace —
# i.e. "printable" minus punctuation.
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
# Rough heuristic for chemical formulas like C3H8O: one or more groups of
# 1-2 uppercase letters followed by 1-2 digits. Produces some false positives.
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
# Result type of parse_page_string: start page, end page (or None) and the
# inclusive page count (or None).
ParsedPages = collections.namedtuple("ParsedPages", "start end count")
def es_compat_hits_total(resp):
    """
    Return the total hit count from a search response dict, supporting both
    the ES6 shape ({"hits": {"total": N}}) and the ES7 shape
    ({"hits": {"total": {"value": N}}}). See:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
    It is responsibility of the call site to set `track_total_hits` in ES7 to
    get an exact number (https://www.elastic.co/guide/en/elasticsearch/reference/master/search-your-data.html#track-total-hits).
    """
    total = resp["hits"]["total"]
    if isinstance(total, dict):
        return total["value"]  # ES7
    return total  # ES6
def parse_page_string(s):
    """
    Parse typical page strings, e.g. 150-180 or p123.

    A lone page number yields (start, None, None); a consistent range yields
    (start, end, count). Lists of pages, roman numerals and other patterns
    are not handled.

    Returns a ParsedPages named tuple; raises ValueError on empty or
    unparseable input, or an inverted range.
    """
    if not s:
        raise ValueError('page parsing: empty string')
    # Strip a leading "p"/"e" marker, e.g. "p123" or "e1234".
    if s[0].lower() in ('p', 'e'):
        s = s[1:]
    if s.isnumeric():
        return ParsedPages(start=int(s), end=None, count=None)
    match = re.match("([0-9]{1,})-([0-9]{1,})", s)
    if match is None:
        raise ValueError('cannot parse page pattern from {}'.format(s))
    first, last = match.groups()
    if len(last) == 1 and first and first[-1] < last:
        # Abbreviated end page, e.g. '261-5' means 261-265.
        last = first[:-1] + last
    elif len(last) == 2 and first and first[-2:] < last:
        # Abbreviated end page, e.g. '577-89' means 577-589.
        last = first[:-2] + last
    lo, hi = int(first), int(last)
    if lo > hi:
        raise ValueError('invalid page range: {}'.format(s))
    return ParsedPages(start=lo, end=hi, count=hi - lo + 1)
def dict_has_key(doc, path):
    """
    Return True if the given glom path resolves inside doc, False otherwise.
    XXX: probably already in glom.
    """
    try:
        glom(doc, path)
        return True
    except PathAccessError:
        return False
def clean_doi(raw: Optional[str]) -> Optional[str]:
    """
    Normalize a raw DOI-like string: lowercase, strip a "doi:" prefix and
    anything before the "10." registrant marker, and collapse a doubled
    slash right after a 4-digit prefix. Returns None when no DOI is found.
    """
    if not raw:
        return None
    value = raw.strip().lower()
    if value.startswith("doi:"):
        value = value[len("doi:"):]
    pos = value.find("10.")
    if pos == -1:
        return None
    value = value[pos:]
    # Handle the occasional "10.1234//suffix" style doubled slash.
    if value[7:9] == "//":
        value = value[:8] + value[9:]
    return value
def doi_prefix(v):
    """
    Return the registrant prefix of a DOI, e.g. "10.1234" for "10.1234/xyz".

    Raises ValueError when the string contains no slash at all.
    """
    prefix, slash, _ = v.partition("/")
    if not slash:
        raise ValueError("invalid doi: {}".format(v))
    return prefix
def has_doi_prefix(v, prefix="10.1234"):
    """
    Return True if v is non-empty and its registrant prefix (the part before
    the first slash) equals the given prefix; False otherwise.
    """
    if v:
        return v.split("/")[0] == prefix
    return False
def slugify_string(s: str) -> str:
    """
    Lowercase s, keep only ASCII letters, digits and whitespace, and collapse
    any whitespace runs into single spaces.
    """
    allowed = string.digits + string.ascii_letters + string.whitespace
    kept = [c for c in s.lower() if c in allowed]
    return ' '.join(''.join(kept).split())
def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
    """
    Return a callable extracting column `f` (zero-based) from a line, after
    stripping it and splitting on `sep`.

    When the column is missing, the callable returns "" if
    `ignore_missing_column` is set, otherwise it raises ValueError.
    """
    def extract(value):
        columns = value.strip().split(sep)
        if f < len(columns):
            return columns[f]
        if ignore_missing_column:
            return ""
        raise ValueError('cannot split value {} into {} parts'.format(value, f))
    return extract
def author_similarity_score(u, v):
    """
    Similarity of two author strings in [0, 1], computed as the Jaccard
    overlap of their token n-gram sets.
    """
    grams_u = set(token_n_grams(u))
    grams_v = set(token_n_grams(v))
    return jaccard(grams_u, grams_v)
def jaccard(a, b):
    """
    Jaccard index |a & b| / |a | b| of two sets; 0 when both are empty.
    """
    union = a | b
    if not union:
        return 0
    return len(a & b) / len(union)
def token_n_grams(s, n=2):
    """
    Return a flat list of n-grams, calculated per token of s.
    """
    grams = []
    for token in tokenize_string(s):
        for piece in nwise(token, n=n):
            grams.append("".join(piece))
    return grams
def tokenize_string(s):
    """
    Lowercase s and split it on whitespace. Normalization and tokenization
    should probably be broken up.
    """
    return s.lower().split()
def nwise(iterable, n=2):
    """
    Generalized pairwise: yield tuples of up to `n` consecutive items from
    iterable; the final tuple may be shorter.
    """
    it = iter(iterable)
    # Two-argument iter() keeps pulling n-sized slices until the empty
    # tuple sentinel signals exhaustion.
    for chunk in iter(lambda: tuple(itertools.islice(it, n)), ()):
        yield chunk
def num_project(s):
    """
    Replace every run of digits in s with the token "<NUM>".

    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
    """
    pattern = re.compile(r'\d+')
    return pattern.sub('<NUM>', s)
def contains_chemical_formula(s):
    """
    Return True if any whitespace-separated token of s matches the rough
    chemical formula pattern (e.g. C3H8O), False otherwise.
    """
    # Fix: the original fell through without a return statement and yielded
    # an implicit None on the no-match path; any() gives an explicit bool.
    return any(CHEM_FORMULA.search(token) for token in s.split())
def random_word(func=lambda w: True, wordsfile='/usr/share/dict/words'):
    """
    Return a single random word from wordsfile (the UNIX words file in its
    typical location by default), restricted to words for which func returns
    a truthy value.

    Raises RuntimeError when wordsfile does not exist.
    """
    if not os.path.exists(wordsfile):
        raise RuntimeError('file not found: {}'.format(wordsfile))
    with open(wordsfile) as handle:
        candidates = [w.strip() for w in handle if func(w.strip())]
    return random.choice(candidates)
def random_idents_from_query(query="*",
                             es="https://search.fatcat.wiki/fatcat_release/_search",
                             r=2):
    """
    Return `r` random idents matching a search query against an
    elasticsearch release index.

    Raises RuntimeError when the request fails or fewer than two documents
    match.
    """
    resp = requests.get(es, params={"q": query})
    if resp.status_code != 200:
        # Fix: previously formatted `r.url`, but `r` is the sample size (an
        # int) — the request URL lives on the response object.
        raise RuntimeError('could not query {} for random item: {}'.format(es, resp.url))
    payload = resp.json()
    if es_compat_hits_total(payload) < 2:
        raise RuntimeError('too few documents')
    idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
    return random.sample(idents, r)
def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    >>> for line in zstdlines("file.zst"):
    ...     print(line)

    Lines are yielded without the trailing newline. A final line that is
    not newline-terminated is yielded as well.
    """
    with open(filename, "rb") as f:
        decomp = ZstdDecompressor()
        with decomp.stream_reader(f) as reader:
            prev_line = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                while True:
                    # We start with bytes but want unicode, which might not
                    # align; so we jitter around the end to complete the
                    # codepoint.
                    try:
                        string_data = chunk.decode(encoding)
                    except UnicodeDecodeError:
                        extra = reader.read(1)
                        if not extra:
                            # Fix: EOF in the middle of a codepoint used to
                            # loop forever on the same undecodable tail;
                            # surface the decode error instead.
                            raise
                        chunk = chunk + extra
                    else:
                        break
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = prev_line + line
                    yield line
                prev_line = lines[-1]
            if prev_line:
                # Fix: a final line without a trailing newline was
                # previously dropped silently.
                yield prev_line
def shellout(template,
             preserve_whitespace=False,
             executable='/bin/bash',
             ignoremap=None,
             encoding=None,
             pipefail=True,
             **kwargs):
    """
    Format a shell command template (str.format mini language) with kwargs
    and execute it.

    `kwargs` must contain every placeholder used by the template; `output`
    is the exception and is autofilled with a temporary file path when the
    template uses it but the caller did not supply it. With `pipefail` the
    command runs in a subshell with `set -o pipefail`, so a failing pipe
    member fails the whole command. If `preserve_whitespace` is True, no
    whitespace normalization is performed. A custom shell executable name
    can be passed in `executable` and defaults to `/bin/bash`. If
    `encoding` is set, `template` is treated as bytes and decoded first.

    Raises RuntimeError on nonzero exit codes, unless the code is a key in
    `ignoremap` (a dict mapping ignorable exit codes to messages). Returns
    the value of `output`.

    Simple template:

        wc -l < {input} > {output}

    Quoted curly braces:

        ps ax|awk '{{print $1}}' > {output}
    """
    if 'output' not in kwargs:
        kwargs['output'] = tempfile.mkstemp(prefix='gluish-')[1]
    if ignoremap is None:
        ignoremap = {}
    rendered = (template.decode(encoding).format(**kwargs)
                if encoding else template.format(**kwargs))
    if not preserve_whitespace:
        rendered = re.sub('[ \t\n]+', ' ', rendered)
    if pipefail:
        rendered = '(set -o pipefail && %s)' % rendered
    returncode = subprocess.call([rendered], shell=True, executable=executable)
    if returncode != 0 and returncode not in ignoremap:
        error = RuntimeError('%s exitcode: %s' % (rendered, returncode))
        # Expose the exit code to callers that want to inspect it.
        error.code = returncode
        raise error
    return kwargs.get('output')
|