aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/cluster.py53
-rw-r--r--fuzzycat/verify.py4
2 files changed, 15 insertions, 42 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 2b01657..28d0417 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -81,25 +81,11 @@ __all__ = [
"release_key_title_normalized",
"release_key_title_nysiis",
"release_key_title_sandcrawler",
- "sort_by_column",
- "group_by",
"Cluster",
]
@dataclass
-class Contrib:
- """
- A contributor.
- """
- index: Optional[int]
- raw_name: Optional[str]
- given_name: Optional[str]
- surname: Optional[str]
- role: Optional[str]
-
-
-@dataclass
class KeyDoc:
"""
A document from which we can derive a key, e.g. a release entity.
@@ -108,18 +94,6 @@ class KeyDoc:
title: str
-@dataclass
-class ClusterResult:
- """
- Result of clustering, one key and a list of
-
- A first approach: pass document through.
- """
- key: str
- comment: str
- docs: List[Any] = field(default_factory=list)
-
-
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
@@ -347,7 +321,7 @@ def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
class Cluster:
"""
- Runs clustering over a potentially large number of records.
+ Setup and run clustering over a potentially large number of records.
"""
def __init__(self,
iterable: collections.abc.Iterable,
@@ -357,12 +331,15 @@ class Cluster:
prefix: str = "fuzzycat-",
tmpdir: str = tempfile.gettempdir(),
strict: bool = False,
- max_cluster_size=100):
+ max_cluster_size: int = 100):
self.iterable: collections.abc.Iterable = iterable
self.key: Callable[[Any], Tuple[str, str]] = key
self.output: IO[str] = output
self.prefix: str = prefix
self.tmpdir: str = tmpdir
+ self.strict = strict
+ self.key_denylist = key_denylist
+ self.max_cluster_size = max_cluster_size
self.counter: Dict[str, int] = collections.Counter({
"key_fail": 0,
"key_ok": 0,
@@ -370,15 +347,11 @@ class Cluster:
"key_denylist": 0,
"num_clusters": 0,
})
- self.strict = strict
- self.key_denylist = key_denylist
- self.max_cluster_size = max_cluster_size
def run(self):
"""
- First map documents to keys, then group by keys.
-
- Outline: json -> tsv -> sort -> group -> json
+ First map documents to keys, then group by keys, outline: json -> tsv
+ -> sort -> group -> json.
"""
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for i, line in enumerate(self.iterable):
@@ -416,8 +389,8 @@ class Cluster:
def sort(self, filename: str, opts: str = "-k 2", fast: bool = True, mode: str = "w"):
"""
- Sort tabular file with sort(1), returns the filename of the sorted file.
- TODO: use separate /fast/tmp for sort.
+ Sort tabular file with sort(1), returns the filename of the sorted
+ file. Options to sort can be passed in via opts keyword argument.
"""
with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=self.prefix) as tf:
env = os.environ.copy()
@@ -433,11 +406,11 @@ class Cluster:
key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
"""
Extract a key from elements of an iterable and group them. Just as
- uniq(1), the iterable must be ordered (by the key that is extracted)
- for this to work.
+ uniq(1), the input iterable must be ordered (by the key that is
+ extracted) for this to work.
- There might be large clusters, which would currently exceed memory.
- Mitigate by splitting large clusters into parts.
+ There might be large clusters, which would currently exceed memory,
+ hence the max_cluster_size option.
"""
for k, g in itertools.groupby(seq, key=key):
payload = []
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index eeafaf6..c322f39 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -189,8 +189,8 @@ def compare(a, b):
# TODO: figshare versions, "xxx.v1"
FIGSHARE_PREFIX = "10.6084"
- if a.get("doi") and b.get("doi") and a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get("doi").startswith(FIGSHARE_PREFIX +
- "/"):
+ if a.get("doi") and b.get("doi") and a.get("doi").startswith(FIGSHARE_PREFIX + "/") and b.get(
+ "doi").startswith(FIGSHARE_PREFIX + "/"):
a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", a.get("doi"))
if a_doi_v_stripped == b_doi_v_stripped: