author | Martin Czygan <martin.czygan@gmail.com> | 2021-09-24 13:58:51 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-24 13:58:51 +0200 |
commit | 478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch) | |
tree | fa467290e8c8df41a1e97a6de751d0f7e790c9de | |
parent | 86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff) | |
download | fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip |
start larger refactoring: remove cluster
background: verifying hundreds of millions of documents turned out to be
slow; anecdata: running clustering and verification over 1.8B inputs
took over 50h; cf. the Go port (skate), which needed about 2-4h for the
same operations. Also: with Go we do not need the extra GNU parallel
wrapping.
In any case, we aim for the fuzzycat refactoring to provide:
* better, more configurable verification and small-scale matching
* removal of the batch clustering code (and improved refcat docs)
* a place for more generic, similarity-based utilities
The most important piece in fuzzycat is a CSV file containing
hand-picked test examples for verification - and the code that is able
to fulfill that test suite. We want to make this part more robust (see
the sketch below).
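A minimal sketch of what driving that suite could look like. The CSV layout (`ident_a`, `ident_b`, `expected_status` columns) and the `fetch_release` helper are illustrative assumptions; only `fuzzycat.verify.verify` and its `Verify(status, reason)` result come from this codebase.

```python
# Sketch only: the CSV columns and fetch_release are assumed, not part
# of this commit; verify() and its Verify(status, reason) result are real.
import csv

from fuzzycat.verify import verify


def run_suite(csv_path, fetch_release):
    """fetch_release: caller-supplied ident -> release dict (hypothetical)."""
    mismatches = []
    with open(csv_path) as f:
        for row in csv.DictReader(f):
            result = verify(fetch_release(row["ident_a"]),
                            fetch_release(row["ident_b"]))
            # how Status stringifies is an implementation detail; adjust
            # the comparison to the actual enum/str representation
            if str(result.status) != row["expected_status"]:
                mismatches.append((row, result.status, result.reason))
    return mismatches
```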
-rw-r--r-- | fuzzycat/__main__.py | 55
-rw-r--r-- | fuzzycat/cluster.py | 454
-rw-r--r-- | fuzzycat/matching.py | 2
-rw-r--r-- | fuzzycat/sandcrawler.py | 158
-rw-r--r-- | fuzzycat/utils.py | 15
-rw-r--r-- | fuzzycat/verify.py | 16
-rw-r--r-- | tests/test_cluster.py | 189
-rw-r--r-- | tests/test_matching.py | 6
-rw-r--r-- | tests/test_utils.py | 16
9 files changed, 188 insertions, 723 deletions
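The batch clustering removed below boils down to a key-sort-group pattern. A compact, in-memory sketch of that pattern, assuming a simplified stand-in key function (the removed `Cluster` class does the same via `sort(1)` and temporary files to scale past memory):

```python
# In-memory sketch of the removed clustering pattern: derive a key per
# record, sort by key, group with itertools.groupby. The key function
# is a simplified stand-in for release_key_title_normalized.
import itertools
import json
import re


def title_key(doc):
    return re.sub(r"[\W_]+", "", doc["title"].lower())


lines = [
    '{"ident": "1", "title": "Hello, World"}',
    '{"ident": "2", "title": "hello world!"}',
    '{"ident": "3", "title": "other"}',
]

docs = sorted((json.loads(line) for line in lines), key=title_key)
clusters = [{"k": k, "v": list(group)}
            for k, group in itertools.groupby(docs, key=title_key)]
# keep clusters with at least two members (min_cluster_size=2)
print([c for c in clusters if len(c["v"]) >= 2])
```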
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 10c856d..7792df6 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -3,23 +3,14 @@ COMMANDS - cluster verify verify_single verify_ref release_match unstructured - Run, e.g. fuzzycat cluster --help for more options. - EXAMPLES - Clustering with GNU parallel. - - $ zstdcat -T0 release_export_expanded.json.zst | - parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 | - python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl - Bulk verification. $ zstdcat -T0 cluster_tsandcrawler.json.zst | @@ -67,9 +58,6 @@ import elasticsearch import requests from fatcat_openapi_client import ReleaseEntity -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram, - release_key_title_normalized, release_key_title_nysiis, - release_key_title_sandcrawler) from fuzzycat.entities import entity_to_dict from fuzzycat.grobid_unstructured import grobid_parse_unstructured from fuzzycat.matching import anything_to_entity, match_release_fuzzy @@ -82,32 +70,6 @@ logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) -def run_cluster(args): - """ - Run clustering over release entities from database dump. - """ - logger = logging.getLogger('main.run_cluster') - types = { - 'title': release_key_title, - 'tnorm': release_key_title_normalized, - 'tnysi': release_key_title_nysiis, - 'tss': release_key_title_ngram, - 'tsandcrawler': release_key_title_sandcrawler, - } - key_denylist = None - if args.key_denylist: - with open(args.key_denylist, 'r') as f: - key_denylist = [l.strip() for l in f.readlines()] - cluster = Cluster(iterable=fileinput.input(args.files), - key=types.get(args.type), - tmpdir=args.tmpdir, - compress=args.compress, - key_denylist=key_denylist, - prefix=args.prefix) - cluster.run() - logger.debug(json.dumps(dict(cluster.counter))) - - def run_verify(args): """ Run match verification over dataset from clustering step. @@ -253,23 +215,6 @@ if __name__ == '__main__': parser.add_argument("-v", "--verbose", help="be verbose", action='store_true') subparsers = parser.add_subparsers() - sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser]) - sub_cluster.set_defaults(func=run_cluster) - sub_cluster.add_argument('-C', - '--compress', - action="store_true", - help='compress intermediate results') - sub_cluster.add_argument('-f', '--files', default="-", help='input files') - sub_cluster.add_argument('--key-denylist', help='file path to key denylist') - sub_cluster.add_argument('--min-cluster-size', - default=2, - type=int, - help='ignore smaller clusters') - sub_cluster.add_argument('-t', - '--type', - default='title', - help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler') - sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser]) sub_verify.add_argument('-f', '--files', default="-", help='input files') sub_verify.add_argument('--max-cluster-size', diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py deleted file mode 100644 index 7994be7..0000000 --- a/fuzzycat/cluster.py +++ /dev/null @@ -1,454 +0,0 @@ -# pylint: disable=C0103 -""" -Clustering stage. - -* [x] verify needs whole document -* [ ] parallelization misses groups -* [ ] cached match key store (tsv, sqlite3), something ~/.cache/... 
-* [x] reproducibly run tests -* [x] place for put md/tsv record tests - ----- - -* [x] hadoop -> py (bn) -* [ ] gnu parallel, share command line -- note (bn) - ----- - -Ideas: - -* lookup potential matches; TSV [key, ...]; sort -* maybe new "schema" - size vs "common schema" -- key <TAB> {"bibjson": ...} -* merge-join - -``` -$ python -m fuzzycat keygen -s "algo" < ours | sort -k1,1 > a.tsv -$ python -m fuzzycat keygen -s "algo" < other | sort -k1,1 > b.tsv -$ merge-join a.tsv b.tsv -``` - -A couple of "keygen" algos. - -> 10k/s, 1B, ~day - -Partial fields should be ok. - -Q: - -* nysiis - -Deps. - -* pydantic; json "omitempty" -- get rid of it? -* orjson (serialize datetime) -- maybe enough w/ dataclasses w/ dataclasses - -fuzzycat.main -> `__main__.py` - -* elasticsearch-py >> elasticsearch - -Matching releases to non-release entities. - ----- - -Features and integration. - -* work grouping at import time; random pdfs; requires strong verification (vs refcat) -* email out to OCI - -""" - -import collections -import itertools -import json -import multiprocessing -import operator -import os -import re -import subprocess -import sys -import tempfile -import unicodedata -from dataclasses import dataclass -from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple - -import jellyfish -import regex -import zstandard - -from fuzzycat.utils import cut, shellout, slugify_string, zstdlines - -__all__ = [ - "release_key_title", - "release_key_title_normalized", - "release_key_title_nysiis", - "release_key_title_sandcrawler", - "Cluster", -] - - -@dataclass -class KeyDoc: - """ - A document from which we can derive a key, e.g. a release entity. - """ - ident: str - title: str - - -get_ident_title = operator.itemgetter("ident", "title") -ws_replacer = str.maketrans({"\t": " ", "\n": " "}) -non_word_re = re.compile(r'[\W_]+', re.UNICODE) - -# Notes: untie from release_entity, as we are only using a few fields. Maybe -# it's a jsob blob, with a pydantic spec and schema. - - -def release_key_title(doc: KeyDoc) -> Tuple[str, str]: - ident, title = get_ident_title(doc) - if not title: - raise ValueError('title missing for {}'.format(ident)) - title = title.translate(ws_replacer).strip() - return (ident, title) - - -def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]: - ident, title = release_key_title(doc) - title = re.sub(r'[ ]{2,}', ' ', title).lower() - return (ident, non_word_re.sub('', title)) - - -def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]: - """ - Use NYSIIS New York State Identification and Intelligence System. 
- """ - ident, title = release_key_title(doc) - return (ident, jellyfish.nysiis(title)) - - -# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ -SANDCRAWLER_CHAR_MAP = { - '\N{Latin capital letter AE}': 'AE', - '\N{Latin small letter ae}': 'ae', - '\N{Latin capital letter Eth}': 'D', - '\N{Latin small letter eth}': 'd', - '\N{Latin capital letter O with stroke}': 'O', - '\N{Latin small letter o with stroke}': 'o', - '\N{Latin capital letter Thorn}': 'Th', - '\N{Latin small letter thorn}': 'th', - '\N{Latin small letter sharp s}': 's', - '\N{Latin capital letter D with stroke}': 'D', - '\N{Latin small letter d with stroke}': 'd', - '\N{Latin capital letter H with stroke}': 'H', - '\N{Latin small letter h with stroke}': 'h', - '\N{Latin small letter dotless i}': 'i', - '\N{Latin small letter kra}': 'k', - '\N{Latin capital letter L with stroke}': 'L', - '\N{Latin small letter l with stroke}': 'l', - '\N{Latin capital letter Eng}': 'N', - '\N{Latin small letter eng}': 'n', - '\N{Latin capital ligature OE}': 'Oe', - '\N{Latin small ligature oe}': 'oe', - '\N{Latin capital letter T with stroke}': 'T', - '\N{Latin small letter t with stroke}': 't', - - # bnewbold additions; mostly Latin-ish OCR ambiguous - '\N{MICRO SIGN}': 'u', - '\N{LATIN SMALL LETTER C}': 'c', - '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', - '\N{Greek Small Letter Alpha}': 'a', - '\N{Greek Small Letter Beta}': 'b', - '\N{Greek Small Letter Iota}': 'i', - '\N{Greek Small Letter Kappa}': 'k', - '\N{Greek Small Letter Chi}': 'x', - '\N{Greek Small Letter Upsilon}': 'u', - '\N{Greek Small Letter Nu}': 'v', - '\N{Greek Small Letter Gamma}': 'y', - '\N{Greek Small Letter Tau}': 't', - '\N{Greek Small Letter Omicron}': 'o', - # bnewbold map-to-null (for non-printing stuff not in the regex) - '\N{PARTIAL DIFFERENTIAL}': '', - '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', - '\N{N-ARY SUMMATION}': '', - '\N{N-ARY PRODUCT}': '', - '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '', - '\N{SNOWMAN}': '', - '\N{CARON}': '', -} - -SANDCRAWLER_PREFIX_REMOVE = [ - "original article: ", - "original article ", - "article: ", - "title: ", -] - -# regex that matches all characters which should be removed -SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( - r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" -) - - -def sandcrawler_slugify(raw: str) -> str: - """ - Python re-implementation of sandcrawler Scala code for string comparison - ("scorable" strings) - """ - slug = raw.strip().lower() - - # transforms before running regex - for prefix in SANDCRAWLER_PREFIX_REMOVE: - if slug.startswith(prefix): - slug = slug[:len(prefix)] - - slug = slug.replace("'", "'") - - # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() - - # early bailout before executing regex - if not slug: - return "" - - slug = unicodedata.normalize('NFKD', slug) - slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug) - - return slug.lower() - - -def test_sandcrawler_slugify() -> None: - test_cases = [ - ("", ""), - ("asdf", "asdf"), - ("'Hello World!'", "helloworld"), - ("ASDF", "asdf"), - ("as\n df", "asdf"), - ("as\u0142 bb \u00f8", "aslbbo"), - ("`hello¿", "hello"), - ("علمية", "علمية"), - ("期刊的数字", "期刊的数字"), - ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), - ("γ-Globulin", "yglobulin"), - - # "MICRO SIGN" - ("\xb5meter", "umeter"), - # "GREEK SMALL LETTER MU" - 
("\u03bcmeter", "\u03bcmeter"), - - # TODO: ("salt ∧ pepper", "saltpepper"), - # TODO: ("new <b>and</b> improved", "newandimproved"), - - # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt - ("-9223372036854775808/-1", "92233720368547758081"), - (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""), - (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000", - ""), - (r"Ω≈ç√∫˜≤≥÷", "ωc"), - (r"åß∂ƒ©˙∆˚¬…æ", "asfae"), - (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"), - (r"¡™£¢∞§¶•ªº–≠ ", "tmao"), - (r"¸˛Ç◊ı˜Â¯˘¿", "cia"), - (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"), - (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"), - (r"`⁄€‹›fifl‡°·‚—±", "fifl"), - (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", - "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"), - (r"⁰⁴⁵₀₁₂", "045012"), - (r"社會科學院語學研究所", "社會科學院語學研究所"), - # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), - # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), - (r"( ͡° ͜ʖ ͡°)", ""), - # emoji ok? I guess - (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), - (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), - (r"﷽ ", "﷽"), - (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", - "thenezperdianhivemindofchaoszalgo"), - (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), - (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), - ] - - for in_str, out_str in test_cases: - if sandcrawler_slugify(in_str) != out_str: - for c in list(sandcrawler_slugify(in_str)): - try: - print(unicodedata.name(c)) - except ValueError: - print(ord(c)) - #print(ord(c)) - print("----") - for c in list(out_str): - print(unicodedata.name(c)) - print(in_str) - assert sandcrawler_slugify(in_str) == out_str - - -def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]: - ident, title = release_key_title(doc) - slug = sandcrawler_slugify(title) - return (ident, slug) - - -def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]: - """ - Derive a key from title. - - Tokenize title, remote stopwords, lookup first three, lookup last three, - plus authors. TODO(miku): authors. - """ - ident, title = get_ident_title(doc) - slug_title = slugify_string(title) - tokens = slug_title.split() - if len(tokens) < 2 * n: - key = ''.join(tokens) - else: - key = ''.join(tokens[:3] + tokens[-3:]) - return (ident, key) - - -class Cluster: - """ - Setup and run clustering over a potentially large (100m) number of records. - - Two main options are iterable (TODO: work on parsed docs), and the key - function to apply to value to group by. - - TODO: We want compression. 
- """ - def __init__(self, - iterable: collections.abc.Iterable, - key: Callable[[Any], Tuple[str, str]], - output: IO[str] = sys.stdout, - key_denylist: Optional[List[str]] = None, - prefix: str = "fuzzycat-", - tmpdir: str = tempfile.gettempdir(), - strict: bool = False, - min_cluster_size: int = 2, - max_cluster_size: int = 100, - compress=False, - verbose=True): - self.iterable: collections.abc.Iterable = iterable - self.key: Callable[[Any], Tuple[str, str]] = key - self.output: IO[str] = output - self.prefix: str = prefix - self.tmpdir: str = tmpdir - self.strict = strict - self.key_denylist = key_denylist - self.min_cluster_size = min_cluster_size - self.max_cluster_size = max_cluster_size - self.verbose = verbose - self.compress = compress - self.counter: Dict[str, int] = collections.Counter({ - "key_fail": 0, - "key_ok": 0, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 0, - }) - - def run(self): - """ - First map documents to keys, then group by keys, outline: json -> tsv - -> sort -> group -> json. - """ - with tempfile.NamedTemporaryFile(delete=False, mode="wb", prefix=self.prefix) as tf: - if self.compress: - zc = zstandard.ZstdCompressor(level=9, threads=multiprocessing.cpu_count()) - writer = zc.stream_writer(tf) - else: - writer = tf - for i, line in enumerate(self.iterable): - if self.verbose and i % 100000 == 0: - print("@{}".format(i), file=sys.stderr) - try: - doc = json.loads(line) - id, key = self.key(doc) - except (KeyError, ValueError): - if self.strict: - raise - self.counter["key_fail"] += 1 - continue - if not key: - self.counter["key_empty"] += 1 - continue - if self.key_denylist and key in self.key_denylist: - self.counter["key_denylist"] += 1 - continue - self.counter["key_ok"] += 1 - # XXX: if the line itself contains tabs, we need to remove - # them here; maybe offer TSV and JSON output and extra flag - # XXX: this needs to be compressed (e.g. with 2B records, we - # fill up disk too quickly) - data = bytes("{}\t{}\t{}\n".format(id, key, - line.replace("\t", " ").strip()), - encoding="utf-8") - writer.write(data) - if self.compress: - writer.flush(zstandard.FLUSH_FRAME) - - sf = self.sort(tf.name, opts='-k 2') - if self.compress: - f = zstdlines(sf) - else: - f = open(sf) - - for doc in self.group_by(f, key=cut(f=1)): - if len(doc["v"]) < self.min_cluster_size: - continue - self.counter["num_clusters"] += 1 - json.dump(doc, self.output) - self.output.write("\n") - - os.remove(sf) - os.remove(tf.name) - return self.counter - - def sort(self, filename: str, opts: str = "-k 2", fast: bool = True, mode: str = "w"): - """ - Sort tabular file with sort(1), returns the filename of the sorted - file. Options to sort can be passed in via opts keyword argument. - """ - with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=self.prefix) as tf: - env = os.environ.copy() - env["TMPDIR"] = self.tmpdir - if fast: - env["LC_ALL"] = "C" - if self.compress: - output = shellout( - "zstdcat -T0 {input} | LC_ALL=C TMPDIR={tmpdir} sort {opts} | zstd -T0 -c9 > {output}", - input=filename, - tmpdir=self.tmpdir, - opts=opts) - else: - subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True) - output = tf.name - - return output - - def group_by(self, - seq: collections.abc.Iterable, - key: Callable[[Any], str] = None) -> Generator[Any, None, None]: - """ - Extract a key from elements of an iterable and group them. Just as - uniq(1), the input iterable must be ordered (by the key that is - extracted) for this to work. 
- - There might be large clusters, which would currently exceed memory, - hence the max_cluster_size option. - """ - for k, g in itertools.groupby(seq, key=key): - payload = [] - for i, line in enumerate(g): - if i > 0 and i == self.max_cluster_size: - print('max cluster size cut off for: {}'.format(k), file=sys.stderr) - break - # XXX: This is a bit too much "serde", get rid of this. - fields = line.split("\t") - if len(fields) < 3: - continue - payload.append(json.loads(fields[2])) - doc = { - "k": k.strip(), - "v": payload, - } - yield doc diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py index 310dfc2..bcda46d 100644 --- a/fuzzycat/matching.py +++ b/fuzzycat/matching.py @@ -73,7 +73,6 @@ def match_release_fuzzy( if r: return [r] - if release.title is not None and release.contribs is not None: names = " ".join([c.raw_name for c in release.contribs]) body = { @@ -178,7 +177,6 @@ def match_release_fuzzy( if es_compat_hits_total(resp) > 0: return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api) - # TODO: perform more queries on other fields. return [] diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py new file mode 100644 index 0000000..958756a --- /dev/null +++ b/fuzzycat/sandcrawler.py @@ -0,0 +1,158 @@ +import regex +import unicodedata + +# from http://zderadicka.eu/removing-diacritics-marks-from-strings/ +SANDCRAWLER_CHAR_MAP = { + '\N{Latin capital letter AE}': 'AE', + '\N{Latin small letter ae}': 'ae', + '\N{Latin capital letter Eth}': 'D', + '\N{Latin small letter eth}': 'd', + '\N{Latin capital letter O with stroke}': 'O', + '\N{Latin small letter o with stroke}': 'o', + '\N{Latin capital letter Thorn}': 'Th', + '\N{Latin small letter thorn}': 'th', + '\N{Latin small letter sharp s}': 's', + '\N{Latin capital letter D with stroke}': 'D', + '\N{Latin small letter d with stroke}': 'd', + '\N{Latin capital letter H with stroke}': 'H', + '\N{Latin small letter h with stroke}': 'h', + '\N{Latin small letter dotless i}': 'i', + '\N{Latin small letter kra}': 'k', + '\N{Latin capital letter L with stroke}': 'L', + '\N{Latin small letter l with stroke}': 'l', + '\N{Latin capital letter Eng}': 'N', + '\N{Latin small letter eng}': 'n', + '\N{Latin capital ligature OE}': 'Oe', + '\N{Latin small ligature oe}': 'oe', + '\N{Latin capital letter T with stroke}': 'T', + '\N{Latin small letter t with stroke}': 't', + + # bnewbold additions; mostly Latin-ish OCR ambiguous + '\N{MICRO SIGN}': 'u', + '\N{LATIN SMALL LETTER C}': 'c', + '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + '\N{Greek Small Letter Alpha}': 'a', + '\N{Greek Small Letter Beta}': 'b', + '\N{Greek Small Letter Iota}': 'i', + '\N{Greek Small Letter Kappa}': 'k', + '\N{Greek Small Letter Chi}': 'x', + '\N{Greek Small Letter Upsilon}': 'u', + '\N{Greek Small Letter Nu}': 'v', + '\N{Greek Small Letter Gamma}': 'y', + '\N{Greek Small Letter Tau}': 't', + '\N{Greek Small Letter Omicron}': 'o', + # bnewbold map-to-null (for non-printing stuff not in the regex) + '\N{PARTIAL DIFFERENTIAL}': '', + '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', + '\N{N-ARY SUMMATION}': '', + '\N{N-ARY PRODUCT}': '', + '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '', + '\N{SNOWMAN}': '', + '\N{CARON}': '', +} + +SANDCRAWLER_PREFIX_REMOVE = [ + "original article: ", + "original article ", + "article: ", + "title: ", +] + +# regex that matches all characters which should be removed +SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( + 
r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" +) + +def sandcrawler_slugify(raw: str) -> str: + """ + Python re-implementation of sandcrawler Scala code for string comparison + ("scorable" strings) + """ + slug = raw.strip().lower() + + # transforms before running regex + for prefix in SANDCRAWLER_PREFIX_REMOVE: + if slug.startswith(prefix): + slug = slug[:len(prefix)] + + slug = slug.replace("'", "'") + + # iterate over all chars and replace from map, if in map; then lower-case again + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() + + # early bailout before executing regex + if not slug: + return "" + + slug = unicodedata.normalize('NFKD', slug) + slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug) + + return slug.lower() + + +def test_sandcrawler_slugify() -> None: + test_cases = [ + ("", ""), + ("asdf", "asdf"), + ("'Hello World!'", "helloworld"), + ("ASDF", "asdf"), + ("as\n df", "asdf"), + ("as\u0142 bb \u00f8", "aslbbo"), + ("`hello¿", "hello"), + ("علمية", "علمية"), + ("期刊的数字", "期刊的数字"), + ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("γ-Globulin", "yglobulin"), + + # "MICRO SIGN" + ("\xb5meter", "umeter"), + # "GREEK SMALL LETTER MU" + ("\u03bcmeter", "\u03bcmeter"), + + # TODO: ("salt ∧ pepper", "saltpepper"), + # TODO: ("new <b>and</b> improved", "newandimproved"), + + # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt + ("-9223372036854775808/-1", "92233720368547758081"), + (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""), + (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000", + ""), + (r"Ω≈ç√∫˜≤≥÷", "ωc"), + (r"åß∂ƒ©˙∆˚¬…æ", "asfae"), + (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"), + (r"¡™£¢∞§¶•ªº–≠ ", "tmao"), + (r"¸˛Ç◊ı˜Â¯˘¿", "cia"), + (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"), + (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"), + (r"`⁄€‹›fifl‡°·‚—±", "fifl"), + (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"), + (r"⁰⁴⁵₀₁₂", "045012"), + (r"社會科學院語學研究所", "社會科學院語學研究所"), + # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"), + # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"), + (r"( ͡° ͜ʖ ͡°)", ""), + # emoji ok? 
I guess + (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"), + (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"), + (r"﷽ ", "﷽"), + (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", + "thenezperdianhivemindofchaoszalgo"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"), + (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"), + ] + + for in_str, out_str in test_cases: + if sandcrawler_slugify(in_str) != out_str: + for c in list(sandcrawler_slugify(in_str)): + try: + print(unicodedata.name(c)) + except ValueError: + print(ord(c)) + print("----") + for c in list(out_str): + print(unicodedata.name(c)) + print(in_str) + assert sandcrawler_slugify(in_str) == out_str + diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 303daf6..24e103a 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -26,12 +26,12 @@ def es_compat_hits_total(resp): https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html It is responsibility of the call site to set `track_total_hits` in ES7 to - get an exact number. + get an exact number (https://www.elastic.co/guide/en/elasticsearch/reference/master/search-your-data.html#track-total-hits). """ try: - return resp["hits"]["total"]["value"] + return resp["hits"]["total"]["value"] # ES7 except TypeError: - return resp["hits"]["total"] + return resp["hits"]["total"] # ES6 def parse_page_string(s): @@ -44,6 +44,8 @@ def parse_page_string(s): Does not handle lists of page numbers, roman numerals, and several other patterns. + + Returns a named tuple with start, end and count fields. """ if not s: raise ValueError('page parsing: empty string') @@ -69,7 +71,7 @@ def parse_page_string(s): return ParsedPages(start=a, end=b, count=count) -def dict_key_exists(doc, path): +def dict_has_key(doc, path): """ Return true, if key in a dictionary at a given path exists. XXX: probably already in glom. @@ -101,7 +103,10 @@ def doi_prefix(v): """ Return the prefix of a DOI. 
""" - return v.split("/")[0] + parts = v.split("/") + if len(parts) == 1: + raise ValueError("invalid doi: {}".format(v)) + return parts[0] def has_doi_prefix(v, prefix="10.1234"): diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 5b90c47..9eb808b 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -91,7 +91,7 @@ from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_ TITLE_FRAGMENT_BLACKLIST) from fuzzycat.entities import entity_to_dict from fuzzycat.utils import (author_similarity_score, clean_doi, contains_chemical_formula, - dict_key_exists, doi_prefix, has_doi_prefix, jaccard, num_project, + dict_has_key, doi_prefix, has_doi_prefix, jaccard, num_project, parse_page_string, slugify_string) Verify = collections.namedtuple("Verify", "status reason") @@ -233,10 +233,10 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): if a_doi + "u" == b_doi or b_doi + "u" == a_doi: return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED) - if a_title == b_title and ((dict_key_exists(a, "extra.subtitle") - and not dict_key_exists(b, "extra.subtitle")) or - (dict_key_exists(b, "extra.subtitle") - and not dict_key_exists(a, "extra.subtitle"))): + if a_title == b_title and ((dict_has_key(a, "extra.subtitle") + and not dict_has_key(b, "extra.subtitle")) or + (dict_has_key(b, "extra.subtitle") + and not dict_has_key(a, "extra.subtitle"))): return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC) except PathAccessError: pass @@ -301,7 +301,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # beware: we have versions and "isPartOf", e.g. # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 # Datacite md schema: https://doi.org/10.14454/7xq3-zf69 - if dict_key_exists(a, "extra.datacite") and dict_key_exists(b, "extra.datacite"): + if dict_has_key(a, "extra.datacite") and dict_has_key(b, "extra.datacite"): whitelist = set([ "HasPart", "HasVersion", @@ -511,8 +511,8 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # if any([a_authors, b_authors]) and not (a_authors and b_authors): # Does not cover case, where both authors are empty. if a_release_year == b_release_year and a_title_lower == b_title_lower: - if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi")) - or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))): + if ((dict_has_key(a, "ext_ids.pmid") and dict_has_key(b, "ext_ids.doi")) + or (dict_has_key(b, "ext_ids.pmid") and dict_has_key(a, "ext_ids.doi"))): return Verify(Status.STRONG, Reason.PMID_DOI_PAIR) # Two JSTOR items will probably be different. 
diff --git a/tests/test_cluster.py b/tests/test_cluster.py deleted file mode 100644 index 55b349a..0000000 --- a/tests/test_cluster.py +++ /dev/null @@ -1,189 +0,0 @@ -import collections -import io -import json -import os -import tempfile - -import pytest - -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, - release_key_title_nysiis) - -Case = collections.namedtuple("Case", 'input output') - - -def test_release_key_title(): - with pytest.raises(KeyError): - release_key_title({}) - with pytest.raises(KeyError, match='title'): - release_key_title({'ident': '123'}) - with pytest.raises(KeyError, match='ident'): - release_key_title({'title': 'deep learning backdoor'}) - with pytest.raises(ValueError, match='title.*missing'): - release_key_title({'ident': '', 'title': ''}) - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'Simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'Sim hash')), - ) - for case in cases: - assert case.output == release_key_title(case.input) - - -def test_release_key_title_normalized(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'theyear1929')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' - }, output=('123', 'h2019')), - ) - for case in cases: - assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format( - case.input) - - -def test_release_key_title_nysiis(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'SAN')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'T')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019?')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' 
- }, output=('123', 'H~2019?')), - Case(input={ - 'ident': '123', - 'title': '世界' - }, output=('123', '世界')), - ) - for case in cases: - assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( - case.input) - - -def test_cluster(): - sio = io.StringIO() - lines = [ - json.dumps(doc) for doc in [ - { - "title": "hello world", - "ident": 1, - }, - { - "title": "hello world!", - "ident": 2, - }, - ] - ] - cluster = Cluster(lines, release_key_title_normalized, output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 2, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 1 - } - assert json.loads(sio.getvalue()) == { - "k": "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - } - - sio = io.StringIO() - cluster = Cluster([ - json.dumps(line) for line in [ - { - "title": "hello world", - "ident": 1 - }, - { - "title": "hello world!", - "ident": 2 - }, - { - "title": "other", - "ident": 3 - }, - ] - ], - release_key_title_normalized, - min_cluster_size=1, - output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 3, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 2 - } - assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{ - "k": - "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - }, { - 'k': - 'other', - 'v': [{ - 'ident': 3, - 'title': 'other' - }] - }] diff --git a/tests/test_matching.py b/tests/test_matching.py index 90d1fee..ad971a5 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog): }, 2), ({ "title": "", - "contribs": [{"raw_name": "Aristoteles"}], + "contribs": [{ + "raw_name": "Aristoteles" + }], "ext_ids": {} }, 5), # ({ @@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog): result = match_release_fuzzy(entity, es=es_client) with caplog.at_level(logging.INFO): logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) + [v.title for v in result])) assert len(result) == count, doc diff --git a/tests/test_utils.py b/tests/test_utils.py index 957203f..b2242b8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import pytest import os from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string, - token_n_grams, tokenize_string, parse_page_string, dict_key_exists, + token_n_grams, tokenize_string, parse_page_string, dict_has_key, zstdlines, es_compat_hits_total, clean_doi) @@ -67,13 +67,13 @@ def test_nwise(): assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)] -def test_dict_key_exists(): - assert dict_key_exists({}, "") is False - assert dict_key_exists({"a": "a"}, "a") == True - assert dict_key_exists({"a": "a"}, "b") == False - assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True - assert dict_key_exists({"a": {"b": None}}, "a.b") == True - assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False +def test_dict_has_key(): + assert dict_has_key({}, "") is False + assert dict_has_key({"a": "a"}, "a") == True + assert dict_has_key({"a": "a"}, "b") == False + assert dict_has_key({"a": {"b": "c"}}, "a.b") == True + assert dict_has_key({"a": {"b": None}}, "a.b") == True + assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False def test_page_page_string(): |
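Finally, a usage sketch for the newly extracted `fuzzycat.sandcrawler.sandcrawler_slugify` as a grouping key for small-scale matching. The grouping loop is illustrative; only the slugify function and the expected `helloworld` key come from this commit's code and tests.

```python
# Group titles by their sandcrawler slug; variants in case, whitespace
# and diacritics land under the same key.
from collections import defaultdict

from fuzzycat.sandcrawler import sandcrawler_slugify

titles = ["'Hello World!'", "hello  world", "Hello W\u00f6rld"]

groups = defaultdict(list)
for title in titles:
    key = sandcrawler_slugify(title)
    if key:  # empty slugs carry no matching signal
        groups[key].append(title)

# all three variants map to the key "helloworld"
print(dict(groups))
```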