aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-12 12:04:59 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-12 12:04:59 +0100
commit2b1c5407225c03e46ba4649ac2c7898e9ffa01ac (patch)
tree2e5801cfc46b1e33fdc5deae961737bc132138e0 /fuzzycat
parent0f9a45f3c4657eb4372b96e2a6c550c4a01226e5 (diff)
downloadfuzzycat-2b1c5407225c03e46ba4649ac2c7898e9ffa01ac.tar.gz
fuzzycat-2b1c5407225c03e46ba4649ac2c7898e9ffa01ac.zip
reduce custom schema for now
Diffstat (limited to 'fuzzycat')
-rw-r--r--fuzzycat/cluster.py29
1 files changed, 16 insertions, 13 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index cc74deb..440dbe9 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -150,21 +150,16 @@ def release_key_title_normalized(doc: KeyDoc) -> Tuple[str, str]:
def release_key_title_nysiis(doc: KeyDoc) -> Tuple[str, str]:
+ """
+ Use NYSIIS New York State Identification and Intelligence System.
+ """
ident, title = release_key_title(doc)
return (ident, fuzzy.nysiis(title))
def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
"""
- Derive a key from title and authors. Authors in contribs list:
-
- "contribs": [
- {
- "index": 0,
- "raw_name": "Meise Botanic Garden",
- "role": "author"
- }
- ],
+ Derive a key from title.
Tokenize title, remote stopwords, lookup first three, lookup last three,
plus authors. TODO(miku): authors.
@@ -223,6 +218,8 @@ class Cluster:
def run(self):
"""
First map documents to keys, then group by keys.
+
+ Outline: json -> tsv -> sort -> group -> json
"""
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for line in self.iterable:
@@ -242,7 +239,7 @@ class Cluster:
try:
sf = self.sort(tf.name, opts='-k 2')
with open(sf) as f:
- for doc in self.group_by(f, key=cut(f=1), value=cut(f=0)):
+ for doc in self.group_by(f, key=cut(f=1)):
self.counter["num_clusters"] += 1
json.dump(doc, self.output)
self.output.write("\n")
@@ -270,15 +267,21 @@ class Cluster:
def group_by(self,
seq: collections.abc.Iterable,
- key: Callable[[Any], str] = None,
- value: Callable[[Any], str] = None) -> Generator[Any, None, None]:
+ key: Callable[[Any], str] = None) -> Generator[Any, None, None]:
"""
Extract a key from elements of an iterable and group them. Just as
uniq(1), the iterable must be ordered for this to work.
"""
for k, g in itertools.groupby(seq, key=key):
+ items = list(g)
+ payload = []
+ for line in items:
+ fields = line.split("\t")
+ if len(fields) < 3:
+ continue
+ payload.append(json.loads(fields[2]))
doc = {
"k": k.strip(),
- "v": [value(v) for v in g],
+ "v": payload,
}
yield doc