about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-06 00:15:48 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-06 00:15:48 +0100
commit f48e724619809d4267c2a9ff3fd75df10cb89a6d (patch)
tree 04082a0a8bbb0e2321308b3eb0fa2a2d67a7a830
parent 34e1716a5d6c3c22a8f4ae429759b1ca199e7ca4 (diff)
download fuzzycat-f48e724619809d4267c2a9ff3fd75df10cb89a6d.tar.gz
fuzzycat-f48e724619809d4267c2a9ff3fd75df10cb89a6d.zip
add type annotations
-rw-r--r-- .mypy.ini 3
-rw-r--r-- Makefile 4
-rw-r--r-- fuzzycat/cluster.py 34
-rw-r--r-- fuzzycat/verify.py 2
4 files changed, 26 insertions, 17 deletions
diff --git a/.mypy.ini b/.mypy.ini
new file mode 100644
index 0000000..ebcf395
--- /dev/null
+++ b/.mypy.ini
@@ -0,0 +1,3 @@
+[mypy]
+ignore_missing_imports = True
+
diff --git a/Makefile b/Makefile
index 562b68d..51aeaa2 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,10 @@ test: ## Run coverage report
lint: $(PY_FILES)
pylint fuzzycat
+.PHONY: mypy
+mypy:
+ mypy --strict $$(find fuzzycat -name "*py")
+
.PHONY: clean
clean: ## Clean all artifacts
rm -rf build
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 57b2d3e..755e94f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -14,7 +14,7 @@ import re
import subprocess
import sys
import tempfile
-from typing import List, Optional
+from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
import fuzzy
from pydantic import BaseModel
@@ -49,14 +49,6 @@ class KeyDoc(BaseModel):
contribs: Optional[List[Contrib]]
-class MapResult(BaseModel):
- """
- Result of deriving a key from a doc.
- """
- id: str
- value: str
-
-
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
@@ -65,7 +57,7 @@ non_word_re = re.compile(r'[\W_]+', re.UNICODE)
# it's a jsob blob, with a pydantic spec and schema.
-def release_key_title(doc: KeyDoc) -> MapResult:
+def release_key_title(doc: KeyDoc) -> Tuple[str, str]:
id, title = get_ident_title(doc)
if not title:
raise ValueError('title missing')
@@ -73,19 +65,19 @@ def release_key_title(doc: KeyDoc) -> MapResult:
return (id, title)
-def release_key_title_normalized(doc: KeyDoc) -> MapResult:
+def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
id, title = release_key_title(doc)
title = re.sub(r'[ ]{2,}', ' ', title)
title = title.lower()
return (id, non_word_re.sub('', title))
-def release_key_title_nysiis(doc: KeyDoc) -> MapResult:
+def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
id, title = release_key_title(doc)
return (id, fuzzy.nysiis(title))
-def release_key_title_authors_ngram(doc: KeyDoc) -> MapResult:
+def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
"""
Derive a key from title and authors. Authors in contribs list:
@@ -102,7 +94,12 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> MapResult:
# SS: compare ngram sets?
-def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
+def sort_by_column(filename: str,
+ opts: str = "-k 2",
+ fast: bool = True,
+ mode: str = "w",
+ prefix: str = "fuzzycat-",
+ tmpdir: Optional[str] = None):
"""
Sort tabular file with sort(1), returns the filename of the sorted file.
TODO: use separate /fast/tmp for sort.
@@ -118,7 +115,10 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-
return tf.name
-def group_by(seq, key=None, value=None, comment=""):
+def group_by(seq: collections.abc.Iterable,
+ key: Callable[[Any], str] = None,
+ value: Callable[[Any], str] = None,
+ comment: str = "") -> Generator[Any, None, None]:
"""
Iterate over lines in filename, group by key (a callable deriving the key
from the line), then apply value callable on the same value to emit a
@@ -135,7 +135,7 @@ def group_by(seq, key=None, value=None, comment=""):
yield doc
-def cut(f=0, sep='\t', ignore_missing_column=True):
+def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
"""
Return a callable, that extracts a given column from a file with a specific
separator. TODO: move this into more generic place.
@@ -176,7 +176,7 @@ class Cluster:
Run clustering and write output to given stream or file.
"""
keyfunc = self.keyfunc # Save a lookup in loop.
- counter = collections.Counter()
+ counter: Dict[str, int] = collections.Counter()
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for line in fileinput.input(files=self.files):
try:
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9f5eaa8..a9cc799 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -24,6 +24,8 @@ store, or some other cache
"""
+import requests
+
def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
"""