diff options
-rw-r--r-- | fuzzycat/cluster.py | 31 |
-rw-r--r-- | fuzzycat/main.py | 2 |
2 files changed, 14 insertions, 19 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index e4a36bf..2005c2f 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -4,8 +4,8 @@ Clustering stage. * [ ] verify needs whole document * [ ] parallelization misses groups -* [ ] cached match key store (sqlite3), something ~/.cache/... -* [ ] reproducibly run test +* [ ] cached match key store (tsv, sqlite3), something ~/.cache/... +* [ ] reproducibly run tests * [ ] place for put md record tests ---- @@ -72,7 +72,7 @@ import tempfile from typing import Any, Callable, Dict, Generator, List, Optional, Tuple import fuzzy -from pydantic import BaseModel +from dataclasses import dataclass, field __all__ = [ "release_key_title", @@ -84,7 +84,8 @@ __all__ = [ ] -class Contrib(BaseModel): +@dataclass +class Contrib: """ A contributor. """ @@ -94,29 +95,25 @@ class Contrib(BaseModel): surname: Optional[str] role: Optional[str] - -class KeyDoc(BaseModel): +@dataclass +class KeyDoc: """ A document from which we can derive a key, e.g. a release entity. """ ident: str - title: Optional[str] - contribs: Optional[List[Contrib]] + title: str + contribs: List[Contrib] = field(default_factory=list) -class ClusterResult(BaseModel): +@dataclass +class ClusterResult: """ - Result of clustering. + Result of clustering, one key and a list of - XXX: We could also include the complete document, that would keep it simple - at the expense of a few more things to read. + A first approach: pass document through. 
""" key: str - values: List[str] comment: str - ids: str - title: str - contribs: str - year: str + docs: List[Any] = field(default_factory=list) get_ident_title = operator.itemgetter("ident", "title") diff --git a/fuzzycat/main.py b/fuzzycat/main.py index c7ba23d..6086798 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -19,8 +19,6 @@ import sys import tempfile import fileinput -import orjson as json - from fuzzycat.build import NgramLookup, TitleTokenList from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, release_key_title_nysiis, release_key_title_ngram) |