author    Martin Czygan <martin.czygan@gmail.com>  2021-09-24 13:58:51 +0200
committer Martin Czygan <martin.czygan@gmail.com>  2021-09-24 13:58:51 +0200
commit    478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
tree      fa467290e8c8df41a1e97a6de751d0f7e790c9de
parent    86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
start larger refactoring: remove cluster
Background: verifying hundreds of millions of documents turned out to be a bit slow; anecdata: running clustering and verification over 1.8B inputs took over 50h; cf. the Go port (skate), which required about 2-4h for those operations. Also: with Go we do not need the extra GNU parallel wrapping.

In any case, we aim for the fuzzycat refactoring to provide:

* better, more configurable verification and small scale matching
* removal of batch clustering code (and improved refcat docs)
* a place for a bit more generic, similarity based utils

The most important piece in fuzzycat is a CSV file containing hand-picked test examples for verification - and the code that is able to fulfill that test suite. We want to make this part more robust.
-rw-r--r--  fuzzycat/__main__.py      55
-rw-r--r--  fuzzycat/cluster.py      454
-rw-r--r--  fuzzycat/matching.py       2
-rw-r--r--  fuzzycat/sandcrawler.py  158
-rw-r--r--  fuzzycat/utils.py         15
-rw-r--r--  fuzzycat/verify.py        16
-rw-r--r--  tests/test_cluster.py    189
-rw-r--r--  tests/test_matching.py     6
-rw-r--r--  tests/test_utils.py       16
9 files changed, 188 insertions, 723 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 10c856d..7792df6 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -3,23 +3,14 @@
COMMANDS
- cluster
verify
verify_single
verify_ref
release_match
unstructured
- Run, e.g. fuzzycat cluster --help for more options.
-
EXAMPLES
- Clustering with GNU parallel.
-
- $ zstdcat -T0 release_export_expanded.json.zst |
- parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
- python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
-
Bulk verification.
$ zstdcat -T0 cluster_tsandcrawler.json.zst |
@@ -67,9 +58,6 @@ import elasticsearch
import requests
from fatcat_openapi_client import ReleaseEntity
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
- release_key_title_normalized, release_key_title_nysiis,
- release_key_title_sandcrawler)
from fuzzycat.entities import entity_to_dict
from fuzzycat.grobid_unstructured import grobid_parse_unstructured
from fuzzycat.matching import anything_to_entity, match_release_fuzzy
@@ -82,32 +70,6 @@ logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
-def run_cluster(args):
- """
- Run clustering over release entities from database dump.
- """
- logger = logging.getLogger('main.run_cluster')
- types = {
- 'title': release_key_title,
- 'tnorm': release_key_title_normalized,
- 'tnysi': release_key_title_nysiis,
- 'tss': release_key_title_ngram,
- 'tsandcrawler': release_key_title_sandcrawler,
- }
- key_denylist = None
- if args.key_denylist:
- with open(args.key_denylist, 'r') as f:
- key_denylist = [l.strip() for l in f.readlines()]
- cluster = Cluster(iterable=fileinput.input(args.files),
- key=types.get(args.type),
- tmpdir=args.tmpdir,
- compress=args.compress,
- key_denylist=key_denylist,
- prefix=args.prefix)
- cluster.run()
- logger.debug(json.dumps(dict(cluster.counter)))
-
-
def run_verify(args):
"""
Run match verification over dataset from clustering step.
@@ -253,23 +215,6 @@ if __name__ == '__main__':
parser.add_argument("-v", "--verbose", help="be verbose", action='store_true')
subparsers = parser.add_subparsers()
- sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
- sub_cluster.set_defaults(func=run_cluster)
- sub_cluster.add_argument('-C',
- '--compress',
- action="store_true",
- help='compress intermediate results')
- sub_cluster.add_argument('-f', '--files', default="-", help='input files')
- sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
- sub_cluster.add_argument('--min-cluster-size',
- default=2,
- type=int,
- help='ignore smaller clusters')
- sub_cluster.add_argument('-t',
- '--type',
- default='title',
- help='cluster algorithm: title, tnorm, tnysi, tss, tsandcrawler')
-
sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
sub_verify.add_argument('-f', '--files', default="-", help='input files')
sub_verify.add_argument('--max-cluster-size',
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
deleted file mode 100644
index 7994be7..0000000
--- a/fuzzycat/cluster.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# pylint: disable=C0103
-"""
-Clustering stage.
-
-* [x] verify needs whole document
-* [ ] parallelization misses groups
-* [ ] cached match key store (tsv, sqlite3), something ~/.cache/...
-* [x] reproducibly run tests
-* [x] place for put md/tsv record tests
-
-----
-
-* [x] hadoop -> py (bn)
-* [ ] gnu parallel, share command line -- note (bn)
-
-----
-
-Ideas:
-
-* lookup potential matches; TSV [key, ...]; sort
-* maybe new "schema" - size vs "common schema" -- key <TAB> {"bibjson": ...}
-* merge-join
-
-```
-$ python -m fuzzycat keygen -s "algo" < ours | sort -k1,1 > a.tsv
-$ python -m fuzzycat keygen -s "algo" < other | sort -k1,1 > b.tsv
-$ merge-join a.tsv b.tsv
-```
-
-A couple of "keygen" algos.
-
-> 10k/s, 1B, ~day
-
-Partial fields should be ok.
-
-Q:
-
-* nysiis
-
-Deps.
-
-* pydantic; json "omitempty" -- get rid of it?
-* orjson (serialize datetime) -- maybe enough w/ dataclasses w/ dataclasses
-
-fuzzycat.main -> `__main__.py`
-
-* elasticsearch-py >> elasticsearch
-
-Matching releases to non-release entities.
-
-----
-
-Features and integration.
-
-* work grouping at import time; random pdfs; requires strong verification (vs refcat)
-* email out to OCI
-
-"""
-
-import collections
-import itertools
-import json
-import multiprocessing
-import operator
-import os
-import re
-import subprocess
-import sys
-import tempfile
-import unicodedata
-from dataclasses import dataclass
-from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple
-
-import jellyfish
-import regex
-import zstandard
-
-from fuzzycat.utils import cut, shellout, slugify_string, zstdlines
-
-__all__ = [
- "release_key_title",
- "release_key_title_normalized",
- "release_key_title_nysiis",
- "release_key_title_sandcrawler",
- "Cluster",
-]
-
-
-@dataclass
-class KeyDoc:
- """
- A document from which we can derive a key, e.g. a release entity.
- """
- ident: str
- title: str
-
-
-get_ident_title = operator.itemgetter("ident", "title")
-ws_replacer = str.maketrans({"\t": " ", "\n": " "})
-non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-
-# Notes: untie from release_entity, as we are only using a few fields. Maybe
-# it's a jsob blob, with a pydantic spec and schema.
-
-
-def release_key_title(doc: KeyDoc) -> Tuple[str, str]:
- ident, title = get_ident_title(doc)
- if not title:
- raise ValueError('title missing for {}'.format(ident))
- title = title.translate(ws_replacer).strip()
- return (ident, title)
-
-
-def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
- ident, title = release_key_title(doc)
- title = re.sub(r'[ ]{2,}', ' ', title).lower()
- return (ident, non_word_re.sub('', title))
-
-
-def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
- """
- Use NYSIIS New York State Identification and Intelligence System.
- """
- ident, title = release_key_title(doc)
- return (ident, jellyfish.nysiis(title))
-
-
-# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
-SANDCRAWLER_CHAR_MAP = {
- '\N{Latin capital letter AE}': 'AE',
- '\N{Latin small letter ae}': 'ae',
- '\N{Latin capital letter Eth}': 'D',
- '\N{Latin small letter eth}': 'd',
- '\N{Latin capital letter O with stroke}': 'O',
- '\N{Latin small letter o with stroke}': 'o',
- '\N{Latin capital letter Thorn}': 'Th',
- '\N{Latin small letter thorn}': 'th',
- '\N{Latin small letter sharp s}': 's',
- '\N{Latin capital letter D with stroke}': 'D',
- '\N{Latin small letter d with stroke}': 'd',
- '\N{Latin capital letter H with stroke}': 'H',
- '\N{Latin small letter h with stroke}': 'h',
- '\N{Latin small letter dotless i}': 'i',
- '\N{Latin small letter kra}': 'k',
- '\N{Latin capital letter L with stroke}': 'L',
- '\N{Latin small letter l with stroke}': 'l',
- '\N{Latin capital letter Eng}': 'N',
- '\N{Latin small letter eng}': 'n',
- '\N{Latin capital ligature OE}': 'Oe',
- '\N{Latin small ligature oe}': 'oe',
- '\N{Latin capital letter T with stroke}': 'T',
- '\N{Latin small letter t with stroke}': 't',
-
- # bnewbold additions; mostly Latin-ish OCR ambiguous
- '\N{MICRO SIGN}': 'u',
- '\N{LATIN SMALL LETTER C}': 'c',
- '\N{LATIN SMALL LETTER F WITH HOOK}': 'f',
- '\N{Greek Small Letter Alpha}': 'a',
- '\N{Greek Small Letter Beta}': 'b',
- '\N{Greek Small Letter Iota}': 'i',
- '\N{Greek Small Letter Kappa}': 'k',
- '\N{Greek Small Letter Chi}': 'x',
- '\N{Greek Small Letter Upsilon}': 'u',
- '\N{Greek Small Letter Nu}': 'v',
- '\N{Greek Small Letter Gamma}': 'y',
- '\N{Greek Small Letter Tau}': 't',
- '\N{Greek Small Letter Omicron}': 'o',
- # bnewbold map-to-null (for non-printing stuff not in the regex)
- '\N{PARTIAL DIFFERENTIAL}': '',
- '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
- '\N{N-ARY SUMMATION}': '',
- '\N{N-ARY PRODUCT}': '',
- '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '',
- '\N{SNOWMAN}': '',
- '\N{CARON}': '',
-}
-
-SANDCRAWLER_PREFIX_REMOVE = [
- "original article: ",
- "original article ",
- "article: ",
- "title: ",
-]
-
-# regex that matches all characters which should be removed
-SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
- r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
-)
-
-
-def sandcrawler_slugify(raw: str) -> str:
- """
- Python re-implementation of sandcrawler Scala code for string comparison
- ("scorable" strings)
- """
- slug = raw.strip().lower()
-
- # transforms before running regex
- for prefix in SANDCRAWLER_PREFIX_REMOVE:
- if slug.startswith(prefix):
- slug = slug[:len(prefix)]
-
- slug = slug.replace("&apos;", "'")
-
- # iterate over all chars and replace from map, if in map; then lower-case again
- slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower()
-
- # early bailout before executing regex
- if not slug:
- return ""
-
- slug = unicodedata.normalize('NFKD', slug)
- slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)
-
- return slug.lower()
-
-
-def test_sandcrawler_slugify() -> None:
- test_cases = [
- ("", ""),
- ("asdf", "asdf"),
- ("'Hello World!'", "helloworld"),
- ("ASDF", "asdf"),
- ("as\n df", "asdf"),
- ("as\u0142 bb \u00f8", "aslbbo"),
- ("`hello¿", "hello"),
- ("علمية", "علمية"),
- ("期刊的数字", "期刊的数字"),
- ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
- ("γ-Globulin", "yglobulin"),
-
- # "MICRO SIGN"
- ("\xb5meter", "umeter"),
- # "GREEK SMALL LETTER MU"
- ("\u03bcmeter", "\u03bcmeter"),
-
- # TODO: ("salt &and; pepper", "saltpepper"),
- # TODO: ("new <b>and</b> improved", "newandimproved"),
-
- # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
- ("-9223372036854775808/-1", "92233720368547758081"),
- (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""),
- (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000",
- ""),
- (r"Ω≈ç√∫˜≤≥÷", "ωc"),
- (r"åß∂ƒ©˙∆˚¬…æ", "asfae"),
- (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"),
- (r"¡™£¢∞§¶•ªº–≠ ", "tmao"),
- (r"¸˛Ç◊ı˜Â¯˘¿", "cia"),
- (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"),
- (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"),
- (r"`⁄€‹›fifl‡°·‚—±", "fifl"),
- (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
- "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"),
- (r"⁰⁴⁵₀₁₂", "045012"),
- (r"社會科學院語學研究所", "社會科學院語學研究所"),
- # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
- # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"),
- (r"( ͡° ͜ʖ ͡°)", ""),
- # emoji ok? I guess
- (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
- (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
- (r"﷽ ", "﷽"),
- (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
- "thenezperdianhivemindofchaoszalgo"),
- (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
- (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
- (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
- ]
-
- for in_str, out_str in test_cases:
- if sandcrawler_slugify(in_str) != out_str:
- for c in list(sandcrawler_slugify(in_str)):
- try:
- print(unicodedata.name(c))
- except ValueError:
- print(ord(c))
- #print(ord(c))
- print("----")
- for c in list(out_str):
- print(unicodedata.name(c))
- print(in_str)
- assert sandcrawler_slugify(in_str) == out_str
-
-
-def release_key_title_sandcrawler(doc: KeyDoc) -> Tuple[str, str]:
- ident, title = release_key_title(doc)
- slug = sandcrawler_slugify(title)
- return (ident, slug)
-
-
-def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
- """
- Derive a key from title.
-
- Tokenize title, remote stopwords, lookup first three, lookup last three,
- plus authors. TODO(miku): authors.
- """
- ident, title = get_ident_title(doc)
- slug_title = slugify_string(title)
- tokens = slug_title.split()
- if len(tokens) < 2 * n:
- key = ''.join(tokens)
- else:
- key = ''.join(tokens[:3] + tokens[-3:])
- return (ident, key)
-
-
-class Cluster:
- """
- Setup and run clustering over a potentially large (100m) number of records.
-
- Two main options are iterable (TODO: work on parsed docs), and the key
- function to apply to value to group by.
-
- TODO: We want compression.
- """
- def __init__(self,
- iterable: collections.abc.Iterable,
- key: Callable[[Any], Tuple[str, str]],
- output: IO[str] = sys.stdout,
- key_denylist: Optional[List[str]] = None,
- prefix: str = "fuzzycat-",
- tmpdir: str = tempfile.gettempdir(),
- strict: bool = False,
- min_cluster_size: int = 2,
- max_cluster_size: int = 100,
- compress=False,
- verbose=True):
- self.iterable: collections.abc.Iterable = iterable
- self.key: Callable[[Any], Tuple[str, str]] = key
- self.output: IO[str] = output
- self.prefix: str = prefix
- self.tmpdir: str = tmpdir
- self.strict = strict
- self.key_denylist = key_denylist
- self.min_cluster_size = min_cluster_size
- self.max_cluster_size = max_cluster_size
- self.verbose = verbose
- self.compress = compress
- self.counter: Dict[str, int] = collections.Counter({
- "key_fail": 0,
- "key_ok": 0,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 0,
- })
-
- def run(self):
- """
- First map documents to keys, then group by keys, outline: json -> tsv
- -> sort -> group -> json.
- """
- with tempfile.NamedTemporaryFile(delete=False, mode="wb", prefix=self.prefix) as tf:
- if self.compress:
- zc = zstandard.ZstdCompressor(level=9, threads=multiprocessing.cpu_count())
- writer = zc.stream_writer(tf)
- else:
- writer = tf
- for i, line in enumerate(self.iterable):
- if self.verbose and i % 100000 == 0:
- print("@{}".format(i), file=sys.stderr)
- try:
- doc = json.loads(line)
- id, key = self.key(doc)
- except (KeyError, ValueError):
- if self.strict:
- raise
- self.counter["key_fail"] += 1
- continue
- if not key:
- self.counter["key_empty"] += 1
- continue
- if self.key_denylist and key in self.key_denylist:
- self.counter["key_denylist"] += 1
- continue
- self.counter["key_ok"] += 1
- # XXX: if the line itself contains tabs, we need to remove
- # them here; maybe offer TSV and JSON output and extra flag
- # XXX: this needs to be compressed (e.g. with 2B records, we
- # fill up disk too quickly)
- data = bytes("{}\t{}\t{}\n".format(id, key,
- line.replace("\t", " ").strip()),
- encoding="utf-8")
- writer.write(data)
- if self.compress:
- writer.flush(zstandard.FLUSH_FRAME)
-
- sf = self.sort(tf.name, opts='-k 2')
- if self.compress:
- f = zstdlines(sf)
- else:
- f = open(sf)
-
- for doc in self.group_by(f, key=cut(f=1)):
- if len(doc["v"]) < self.min_cluster_size:
- continue
- self.counter["num_clusters"] += 1
- json.dump(doc, self.output)
- self.output.write("\n")
-
- os.remove(sf)
- os.remove(tf.name)
- return self.counter
-
- def sort(self, filename: str, opts: str = "-k 2", fast: bool = True, mode: str = "w"):
- """
- Sort tabular file with sort(1), returns the filename of the sorted
- file. Options to sort can be passed in via opts keyword argument.
- """
- with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=self.prefix) as tf:
- env = os.environ.copy()
- env["TMPDIR"] = self.tmpdir
- if fast:
- env["LC_ALL"] = "C"
- if self.compress:
- output = shellout(
- "zstdcat -T0 {input} | LC_ALL=C TMPDIR={tmpdir} sort {opts} | zstd -T0 -c9 > {output}",
- input=filename,
- tmpdir=self.tmpdir,
- opts=opts)
- else:
- subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env, check=True)
- output = tf.name
-
- return output
-
- def group_by(self,
- seq: collections.abc.Iterable,
- key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
- """
- Extract a key from elements of an iterable and group them. Just as
- uniq(1), the input iterable must be ordered (by the key that is
- extracted) for this to work.
-
- There might be large clusters, which would currently exceed memory,
- hence the max_cluster_size option.
- """
- for k, g in itertools.groupby(seq, key=key):
- payload = []
- for i, line in enumerate(g):
- if i > 0 and i == self.max_cluster_size:
- print('max cluster size cut off for: {}'.format(k), file=sys.stderr)
- break
- # XXX: This is a bit too much "serde", get rid of this.
- fields = line.split("\t")
- if len(fields) < 3:
- continue
- payload.append(json.loads(fields[2]))
- doc = {
- "k": k.strip(),
- "v": payload,
- }
- yield doc
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index 310dfc2..bcda46d 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -73,7 +73,6 @@ def match_release_fuzzy(
if r:
return [r]
-
if release.title is not None and release.contribs is not None:
names = " ".join([c.raw_name for c in release.contribs])
body = {
@@ -178,7 +177,6 @@ def match_release_fuzzy(
if es_compat_hits_total(resp) > 0:
return response_to_entity_list(resp, entity_type=ReleaseEntity, size=size, api=api)
-
# TODO: perform more queries on other fields.
return []
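
Note: the fuzzy matching entry point is only touched for whitespace here. For orientation, a minimal sketch of calling it; the Elasticsearch endpoint and the example entity are assumptions, not part of this diff:

    import elasticsearch
    from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds
    from fuzzycat.matching import match_release_fuzzy

    # hypothetical search endpoint; point this at a fatcat release index
    es_client = elasticsearch.Elasticsearch(["http://localhost:9200"])
    release = ReleaseEntity(title="The quick brown fox", ext_ids=ReleaseExtIds())
    # returns a (possibly empty) list of candidate ReleaseEntity objects
    candidates = match_release_fuzzy(release, es=es_client)
    print([c.title for c in candidates])
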
diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py
new file mode 100644
index 0000000..958756a
--- /dev/null
+++ b/fuzzycat/sandcrawler.py
@@ -0,0 +1,158 @@
+import regex
+import unicodedata
+
+# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
+SANDCRAWLER_CHAR_MAP = {
+ '\N{Latin capital letter AE}': 'AE',
+ '\N{Latin small letter ae}': 'ae',
+ '\N{Latin capital letter Eth}': 'D',
+ '\N{Latin small letter eth}': 'd',
+ '\N{Latin capital letter O with stroke}': 'O',
+ '\N{Latin small letter o with stroke}': 'o',
+ '\N{Latin capital letter Thorn}': 'Th',
+ '\N{Latin small letter thorn}': 'th',
+ '\N{Latin small letter sharp s}': 's',
+ '\N{Latin capital letter D with stroke}': 'D',
+ '\N{Latin small letter d with stroke}': 'd',
+ '\N{Latin capital letter H with stroke}': 'H',
+ '\N{Latin small letter h with stroke}': 'h',
+ '\N{Latin small letter dotless i}': 'i',
+ '\N{Latin small letter kra}': 'k',
+ '\N{Latin capital letter L with stroke}': 'L',
+ '\N{Latin small letter l with stroke}': 'l',
+ '\N{Latin capital letter Eng}': 'N',
+ '\N{Latin small letter eng}': 'n',
+ '\N{Latin capital ligature OE}': 'Oe',
+ '\N{Latin small ligature oe}': 'oe',
+ '\N{Latin capital letter T with stroke}': 'T',
+ '\N{Latin small letter t with stroke}': 't',
+
+ # bnewbold additions; mostly Latin-ish OCR ambiguous
+ '\N{MICRO SIGN}': 'u',
+ '\N{LATIN SMALL LETTER C}': 'c',
+ '\N{LATIN SMALL LETTER F WITH HOOK}': 'f',
+ '\N{Greek Small Letter Alpha}': 'a',
+ '\N{Greek Small Letter Beta}': 'b',
+ '\N{Greek Small Letter Iota}': 'i',
+ '\N{Greek Small Letter Kappa}': 'k',
+ '\N{Greek Small Letter Chi}': 'x',
+ '\N{Greek Small Letter Upsilon}': 'u',
+ '\N{Greek Small Letter Nu}': 'v',
+ '\N{Greek Small Letter Gamma}': 'y',
+ '\N{Greek Small Letter Tau}': 't',
+ '\N{Greek Small Letter Omicron}': 'o',
+ # bnewbold map-to-null (for non-printing stuff not in the regex)
+ '\N{PARTIAL DIFFERENTIAL}': '',
+ '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
+ '\N{N-ARY SUMMATION}': '',
+ '\N{N-ARY PRODUCT}': '',
+ '\N{MODIFIER LETTER CIRCUMFLEX ACCENT}': '',
+ '\N{SNOWMAN}': '',
+ '\N{CARON}': '',
+}
+
+SANDCRAWLER_PREFIX_REMOVE = [
+ "original article: ",
+ "original article ",
+ "article: ",
+ "title: ",
+]
+
+# regex that matches all characters which should be removed
+SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
+ r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
+)
+
+def sandcrawler_slugify(raw: str) -> str:
+ """
+ Python re-implementation of sandcrawler Scala code for string comparison
+ ("scorable" strings)
+ """
+ slug = raw.strip().lower()
+
+ # transforms before running regex
+ for prefix in SANDCRAWLER_PREFIX_REMOVE:
+ if slug.startswith(prefix):
+ slug = slug[:len(prefix)]
+
+ slug = slug.replace("&apos;", "'")
+
+ # iterate over all chars and replace from map, if in map; then lower-case again
+ slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower()
+
+ # early bailout before executing regex
+ if not slug:
+ return ""
+
+ slug = unicodedata.normalize('NFKD', slug)
+ slug = SANDCRAWLER_REMOVE_CHAR_REGEX.sub('', slug)
+
+ return slug.lower()
+
+
+def test_sandcrawler_slugify() -> None:
+ test_cases = [
+ ("", ""),
+ ("asdf", "asdf"),
+ ("'Hello World!'", "helloworld"),
+ ("ASDF", "asdf"),
+ ("as\n df", "asdf"),
+ ("as\u0142 bb \u00f8", "aslbbo"),
+ ("`hello¿", "hello"),
+ ("علمية", "علمية"),
+ ("期刊的数字", "期刊的数字"),
+ ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+ ("γ-Globulin", "yglobulin"),
+
+ # "MICRO SIGN"
+ ("\xb5meter", "umeter"),
+ # "GREEK SMALL LETTER MU"
+ ("\u03bcmeter", "\u03bcmeter"),
+
+ # TODO: ("salt &and; pepper", "saltpepper"),
+ # TODO: ("new <b>and</b> improved", "newandimproved"),
+
+ # some via https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
+ ("-9223372036854775808/-1", "92233720368547758081"),
+ (r",./;'[]\-= <>?:\"{}|_+ !@#$%^&*()`~", ""),
+ (" \n\r \x85 \u1680\u2002\u2003\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u202f\u205f\u3000",
+ ""),
+ (r"Ω≈ç√∫˜≤≥÷", "ωc"),
+ (r"åß∂ƒ©˙∆˚¬…æ", "asfae"),
+ (r"œ∑´®†¥¨ˆøπ“‘", "oeoπ"),
+ (r"¡™£¢∞§¶•ªº–≠ ", "tmao"),
+ (r"¸˛Ç◊ı˜Â¯˘¿", "cia"),
+ (r"ÅÍÎÏ˝ÓÔÒÚÆ☃", "aiiiooouae"),
+ (r"Œ„´‰ˇÁ¨ˆØ∏”’", "oeao"),
+ (r"`⁄€‹›fifl‡°·‚—±", "fifl"),
+ (r"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
+ "еђгєѕііјљњћкиуџабвгдежзииклмнопрстуфхцчшщъыьэюяабвгдежзииклмнопрстуфхцчшщъыьэюя"),
+ (r"⁰⁴⁵₀₁₂", "045012"),
+ (r"社會科學院語學研究所", "社會科學院語學研究所"),
+ # TODO: ("パーティーへ行かないか", "パーティーへ行かないか"),
+ # TODO: ("表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀", "表ポあa鷗oeebB逍usaan丂㐀𠀀"),
+ (r"( ͡° ͜ʖ ͡°)", ""),
+ # emoji ok? I guess
+ (r"👾 🙇 💁 🙅 🙆 🙋 🙎 🙍", "👾🙇💁🙅🙆🙋🙎🙍"),
+ (r"2️⃣ 3️⃣ 4️⃣ 5️⃣", "2345"),
+ (r"﷽ ", "﷽"),
+ (r"\"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
+ "thenezperdianhivemindofchaoszalgo"),
+ (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
+ (r"The quick brown fox jumps over the lazy dog", "thequickbrownfoxjumpsoverthelazydog"),
+ (r"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 ", "thequickbrownfoxjumpsoverthelazydog"),
+ ]
+
+ for in_str, out_str in test_cases:
+ if sandcrawler_slugify(in_str) != out_str:
+ for c in list(sandcrawler_slugify(in_str)):
+ try:
+ print(unicodedata.name(c))
+ except ValueError:
+ print(ord(c))
+ print("----")
+ for c in list(out_str):
+ print(unicodedata.name(c))
+ print(in_str)
+ assert sandcrawler_slugify(in_str) == out_str
+
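
Note: with the slug code now in its own module, key generation outside of the removed Cluster class stays a one-liner. The expected outputs below are taken from the test cases in the new file; the key pairing in the comment mirrors the removed release_key_title_sandcrawler helper:

    from fuzzycat.sandcrawler import sandcrawler_slugify

    assert sandcrawler_slugify("'Hello World!'") == "helloworld"
    assert sandcrawler_slugify("γ-Globulin") == "yglobulin"
    # a sandcrawler-style release key would pair the entity ident with the slug:
    # (ident, sandcrawler_slugify(title))
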
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 303daf6..24e103a 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -26,12 +26,12 @@ def es_compat_hits_total(resp):
https://www.elastic.co/guide/en/elasticsearch/reference/current/breaking-changes-7.0.html
It is responsibility of the call site to set `track_total_hits` in ES7 to
- get an exact number.
+ get an exact number (https://www.elastic.co/guide/en/elasticsearch/reference/master/search-your-data.html#track-total-hits).
"""
try:
- return resp["hits"]["total"]["value"]
+ return resp["hits"]["total"]["value"] # ES7
except TypeError:
- return resp["hits"]["total"]
+ return resp["hits"]["total"] # ES6
def parse_page_string(s):
@@ -44,6 +44,8 @@ def parse_page_string(s):
Does not handle lists of page numbers, roman numerals, and several other
patterns.
+
+ Returns a named tuple with start, end and count fields.
"""
if not s:
raise ValueError('page parsing: empty string')
@@ -69,7 +71,7 @@ def parse_page_string(s):
return ParsedPages(start=a, end=b, count=count)
-def dict_key_exists(doc, path):
+def dict_has_key(doc, path):
"""
Return true, if key in a dictionary at a given path exists. XXX: probably
already in glom.
@@ -101,7 +103,10 @@ def doi_prefix(v):
"""
Return the prefix of a DOI.
"""
- return v.split("/")[0]
+ parts = v.split("/")
+ if len(parts) == 1:
+ raise ValueError("invalid doi: {}".format(v))
+ return parts[0]
def has_doi_prefix(v, prefix="10.1234"):
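
Note: the renamed dict_has_key and the stricter doi_prefix behave as sketched below; the first three examples mirror the updated tests, the invalid DOI case follows from the new ValueError branch:

    from fuzzycat.utils import dict_has_key, doi_prefix

    dict_has_key({"a": {"b": "c"}}, "a.b")    # True
    dict_has_key({"a": {"b": "c"}}, "a.b.c")  # False
    doi_prefix("10.1234/xyz.123")             # "10.1234"
    doi_prefix("not a doi")                   # raises ValueError
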
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 5b90c47..9eb808b 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -91,7 +91,7 @@ from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_
TITLE_FRAGMENT_BLACKLIST)
from fuzzycat.entities import entity_to_dict
from fuzzycat.utils import (author_similarity_score, clean_doi, contains_chemical_formula,
- dict_key_exists, doi_prefix, has_doi_prefix, jaccard, num_project,
+ dict_has_key, doi_prefix, has_doi_prefix, jaccard, num_project,
parse_page_string, slugify_string)
Verify = collections.namedtuple("Verify", "status reason")
@@ -233,10 +233,10 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
- if a_title == b_title and ((dict_key_exists(a, "extra.subtitle")
- and not dict_key_exists(b, "extra.subtitle")) or
- (dict_key_exists(b, "extra.subtitle")
- and not dict_key_exists(a, "extra.subtitle"))):
+ if a_title == b_title and ((dict_has_key(a, "extra.subtitle")
+ and not dict_has_key(b, "extra.subtitle")) or
+ (dict_has_key(b, "extra.subtitle")
+ and not dict_has_key(a, "extra.subtitle"))):
return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)
except PathAccessError:
pass
@@ -301,7 +301,7 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
# beware: we have versions and "isPartOf", e.g.
# https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4
# Datacite md schema: https://doi.org/10.14454/7xq3-zf69
- if dict_key_exists(a, "extra.datacite") and dict_key_exists(b, "extra.datacite"):
+ if dict_has_key(a, "extra.datacite") and dict_has_key(b, "extra.datacite"):
whitelist = set([
"HasPart",
"HasVersion",
@@ -511,8 +511,8 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
# if any([a_authors, b_authors]) and not (a_authors and b_authors):
# Does not cover case, where both authors are empty.
if a_release_year == b_release_year and a_title_lower == b_title_lower:
- if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi"))
- or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))):
+ if ((dict_has_key(a, "ext_ids.pmid") and dict_has_key(b, "ext_ids.doi"))
+ or (dict_has_key(b, "ext_ids.pmid") and dict_has_key(a, "ext_ids.doi"))):
return Verify(Status.STRONG, Reason.PMID_DOI_PAIR)
# Two JSTOR items will probably be different.
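
Note: the verification entry point these renames touch returns a Verify namedtuple with status and reason fields. A schematic call, assuming two (partial) release entity dicts; the exact status and reason depend on rules not visible in this hunk:

    from fuzzycat.verify import verify

    a = {"title": "Hello World", "release_year": 2020, "ext_ids": {"doi": "10.1234/abc"}}
    b = {"title": "Hello World", "release_year": 2020, "ext_ids": {"pmid": "12345"}}
    result = verify(a, b)
    print(result.status, result.reason)
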
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index 55b349a..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import collections
-import io
-import json
-import os
-import tempfile
-
-import pytest
-
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
- release_key_title_nysiis)
-
-Case = collections.namedtuple("Case", 'input output')
-
-
-def test_release_key_title():
- with pytest.raises(KeyError):
- release_key_title({})
- with pytest.raises(KeyError, match='title'):
- release_key_title({'ident': '123'})
- with pytest.raises(KeyError, match='ident'):
- release_key_title({'title': 'deep learning backdoor'})
- with pytest.raises(ValueError, match='title.*missing'):
- release_key_title({'ident': '', 'title': ''})
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'Simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'Sim hash')),
- )
- for case in cases:
- assert case.output == release_key_title(case.input)
-
-
-def test_release_key_title_normalized():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'theyear1929')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'h2019')),
- )
- for case in cases:
- assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_release_key_title_nysiis():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'SAN')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'T')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019?')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'H~2019?')),
- Case(input={
- 'ident': '123',
- 'title': '世界'
- }, output=('123', '世界')),
- )
- for case in cases:
- assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_cluster():
- sio = io.StringIO()
- lines = [
- json.dumps(doc) for doc in [
- {
- "title": "hello world",
- "ident": 1,
- },
- {
- "title": "hello world!",
- "ident": 2,
- },
- ]
- ]
- cluster = Cluster(lines, release_key_title_normalized, output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 2,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 1
- }
- assert json.loads(sio.getvalue()) == {
- "k": "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }
-
- sio = io.StringIO()
- cluster = Cluster([
- json.dumps(line) for line in [
- {
- "title": "hello world",
- "ident": 1
- },
- {
- "title": "hello world!",
- "ident": 2
- },
- {
- "title": "other",
- "ident": 3
- },
- ]
- ],
- release_key_title_normalized,
- min_cluster_size=1,
- output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 3,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 2
- }
- assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
- "k":
- "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }, {
- 'k':
- 'other',
- 'v': [{
- 'ident': 3,
- 'title': 'other'
- }]
- }]
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 90d1fee..ad971a5 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog):
}, 2),
({
"title": "",
- "contribs": [{"raw_name": "Aristoteles"}],
+ "contribs": [{
+ "raw_name": "Aristoteles"
+ }],
"ext_ids": {}
}, 5),
# ({
@@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog):
result = match_release_fuzzy(entity, es=es_client)
with caplog.at_level(logging.INFO):
logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
+ [v.title for v in result]))
assert len(result) == count, doc
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 957203f..b2242b8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@ import pytest
import os
from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
- token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
+ token_n_grams, tokenize_string, parse_page_string, dict_has_key,
zstdlines, es_compat_hits_total, clean_doi)
@@ -67,13 +67,13 @@ def test_nwise():
assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
-def test_dict_key_exists():
- assert dict_key_exists({}, "") is False
- assert dict_key_exists({"a": "a"}, "a") == True
- assert dict_key_exists({"a": "a"}, "b") == False
- assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
- assert dict_key_exists({"a": {"b": None}}, "a.b") == True
- assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+def test_dict_has_key():
+ assert dict_has_key({}, "") is False
+ assert dict_has_key({"a": "a"}, "a") == True
+ assert dict_has_key({"a": "a"}, "b") == False
+ assert dict_has_key({"a": {"b": "c"}}, "a.b") == True
+ assert dict_has_key({"a": {"b": None}}, "a.b") == True
+ assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False
def test_page_page_string():