cleanup and docs

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-23 23:39:15 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-23 23:39:15 +0100
commit: 4461880f3c3196e7c06f2f25387a01cd4f514027 (patch)
tree: 2b2a15d2e0db1698da3bc345979ca1883faafbbb
parent: 6888c847abe79856772e484a3342945e6e280f72 (diff)
download: fuzzycat-4461880f3c3196e7c06f2f25387a01cd4f514027.tar.gz
fuzzycat-4461880f3c3196e7c06f2f25387a01cd4f514027.zip
2 files changed, 15 insertions, 42 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 2b01657..28d0417 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -81,25 +81,11 @@ __all__ = [
     "release_key_title_normalized",
     "release_key_title_nysiis",
     "release_key_title_sandcrawler",
-    "sort_by_column",
-    "group_by",
     "Cluster",
 ]
 
 
 @dataclass
-class Contrib:
-    """
-    A contributor.
-    """
-    index: Optional[int]
-    raw_name: Optional[str]
-    given_name: Optional[str]
-    surname: Optional[str]
-    role: Optional[str]
-
-
-@dataclass
 class KeyDoc:
     """
     A document from which we can derive a key, e.g. a release entity.
@@ -108,18 +94,6 @@ class KeyDoc:
     title: str
 
 
-@dataclass
-class ClusterResult:
-    """
-    Result of clustering, one key and a list of
-
-    A first approach: pass document through.
-    """
-    key: str
-    comment: str
-    docs: List[Any] = field(default_factory=list)
-
-
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
@@ -347,7 +321,7 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
 
 class Cluster:
     """
-    Runs clustering over a potentially large number of records.
+    Setup and run clustering over a potentially large number of records.
     """
     def __init__(self,
                  iterable: collections.abc.Iterable,
@@ -357,12 +331,15 @@ class Cluster:
                  prefix: str = "fuzzycat-",
                  tmpdir: str = tempfile.gettempdir(),
                  strict: bool = False,
-                 max_cluster_size=100):
+                 max_cluster_size: int = 100):
         self.iterable: collections.abc.Iterable = iterable
         self.key: Callable[[Any], Tuple[str, str]] = key
         self.output: IO[str] = output
         self.prefix: str = prefix
         self.tmpdir: str = tmpdir
+        self.strict = strict
+        self.key_denylist = key_denylist
+        self.max_cluster_size = max_cluster_size
         self.counter: Dict[str, int] = collections.Counter({
             "key_fail": 0,
             "key_ok": 0,
@@ -370,15 +347,11 @@ class Cluster:
             "key_denylist": 0,
             "num_clusters": 0,
         })
-        self.strict = strict
-        self.key_denylist = key_denylist
-        self.max_cluster_size = max_cluster_size
 
     def run(self):
         """
-        First map documents to keys, then group by keys.
-
-        Outline: json -> tsv -> sort -> group -> json
+        First map documents to keys, then group by keys, outline: json -> tsv
+        -> sort -> group -> json.
         """
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
             for i, line in enumerate(self.iterable):
@@ -416,8 +389,8 @@ class Cluster:
 
     def sort(self, filename: str, opts: str = "-k 2", fast: bool = True, mode: str = "w"):
         """
-        Sort tabular file with sort(1), returns the filename of the sorted file.
-        TODO: use separate /fast/tmp for sort.
+        Sort tabular file with sort(1), returns the filename of the sorted
+        file. Options to sort can be passed in via opts keyword argument.
         """
         with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=self.prefix) as tf:
             env = os.environ.copy()
@@ -433,11 +406,11 @@ class Cluster:
                  key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
         """
         Extract a key from elements of an iterable and group them. Just as
-        uniq(1), the iterable must be ordered (by the key that is extracted)
-        for this to work.
+        uniq(1), the input iterable must be ordered (by the key that is
+        extracted) for this to work.
 
-        There might be large clusters, which would currently exceed memory.
-        Mitigate by splitting large clusters into parts.
+        There might be large clusters, which would currently exceed memory,
+        hence the max_cluster_size option.
         """
         for k, g in itertools.groupby(seq, key=key):
             payload = []
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index eeafaf6..c322f39 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -189,8 +189,8 @@ def compare(a, b):
 
     # TODO: figshare versions, "xxx.v1"
     FIGSHARE_PREFIX = "10.6084"
-    if a.get("doi") and b.get("doi") and a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get("doi").startswith(FIGSHARE_PREFIX +
-                                                                                  "/"):
+    if a.get("doi") and b.get("doi") and a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get(
+            "doi").startswith(FIGSHARE_PREFIX + "/"):
         a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
         b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
         if a_doi_v_stripped == b_doi_v_stripped:
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-23 23:39:15 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-23 23:39:15 +0100
commit	4461880f3c3196e7c06f2f25387a01cd4f514027 (patch)
tree	2b2a15d2e0db1698da3bc345979ca1883faafbbb
parent	6888c847abe79856772e484a3342945e6e280f72 (diff)
download	fuzzycat-4461880f3c3196e7c06f2f25387a01cd4f514027.tar.gz fuzzycat-4461880f3c3196e7c06f2f25387a01cd4f514027.zip