diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-10 17:49:24 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-10 17:49:24 +0100 | 
| commit | 219fded46071a98434e28ca100c1a5cbc33fcba1 (patch) | |
| tree | 1be8ced5279ee7f79a697bf0178b35f0db53273a /fuzzycat | |
| parent | 414ea00bdb51766a27375e6ca058e4178fb71b1b (diff) | |
| download | fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.tar.gz fuzzycat-219fded46071a98434e28ca100c1a5cbc33fcba1.zip | |
add tss key option
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/build.py | 2 | ||||
| -rw-r--r-- | fuzzycat/cluster.py | 38 | ||||
| -rw-r--r-- | fuzzycat/main.py | 7 | 
3 files changed, 33 insertions, 14 deletions
| diff --git a/fuzzycat/build.py b/fuzzycat/build.py index 687c17e..e49e7d7 100644 --- a/fuzzycat/build.py +++ b/fuzzycat/build.py @@ -1,5 +1,5 @@  """ -Build auxiliary data structures. +WIP: Build auxiliary data structures for lookup.  """  import fileinput diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index db20320..fa54ec6 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -52,32 +52,38 @@ class KeyDoc(BaseModel):  get_ident_title = operator.itemgetter("ident", "title")  ws_replacer = str.maketrans({"\t": " ", "\n": " "})  non_word_re = re.compile(r'[\W_]+', re.UNICODE) +printable_no_punct = string.digits + string.letters + string.whitespace + +def slugify_string(s: str) -> str: +    """ +    Keeps ascii chars and single whitespace only. +    """ +    return ''.join((c for c in s.lower() if c in printable_no_punct))  # Notes: untie from release_entity, as we are only using a few fields. Maybe  # it's a jsob blob, with a pydantic spec and schema.  def release_key_title(doc: KeyDoc) -> Tuple[str, str]: -    id, title = get_ident_title(doc) +    ident, title = get_ident_title(doc)      if not title: -        raise ValueError('title missing') +        raise ValueError('title missing for {}'.format(ident))      title = title.translate(ws_replacer).strip() -    return (id, title) +    return (ident, title)  def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]: -    id, title = release_key_title(doc) -    title = re.sub(r'[ ]{2,}', ' ', title) -    title = title.lower() -    return (id, non_word_re.sub('', title)) +    ident, title = release_key_title(doc) +    title = re.sub(r'[ ]{2,}', ' ', title).lower() +    return (ident, non_word_re.sub('', title))  def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]: -    id, title = release_key_title(doc) -    return (id, fuzzy.nysiis(title)) +    ident, title = release_key_title(doc) +    return (ident, fuzzy.nysiis(title)) -def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]: +def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:      """      Derive a key from title and authors. Authors in contribs list: @@ -90,10 +96,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:      ],      Tokenize title, remote stopwords, lookup first three, lookup last three, -    plus authors. +    plus authors. TODO(miku): authors.      """ -    # SS: compare ngram sets? - +    ident, title = get_ident_title(doc) +    slug_title = slug_title(title) +    tokens = slug_title.split() +    if len(tokens) < 2 * n: +        key = ''.join(tokens) +    else: +        key = ''.join(tokens[:3] + tokens[-3:]) +    return (ident, key)  def sort_by_column(filename: str,                     opts: str = "-k 2", diff --git a/fuzzycat/main.py b/fuzzycat/main.py index dfc0925..7d298e6 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -32,6 +32,7 @@ def run_cluster(args):          'title': release_key_title,          'tnorm': release_key_title_normalized,          'tnysi': release_key_title_nysiis, +        'tss': release_key_title_ngram,      }      cluster = Cluster(files=args.files,                        keyfunc=types.get(args.type), @@ -42,10 +43,16 @@ def run_cluster(args):  def run_verify(args): +    """ +    TODO. +    """      print('verify')  def run_build(args): +    """ +    Trying out. +    """      if args.type == "ss":          builder = NgramLookup(files=args.files, output=args.output)          builder.run() | 
