drop build module for now

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:30:40 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:30:40 +0100
commit: 56e98d87439fd537cbc118bf30d10e66a8a9eced (patch)
tree: 2ee69811fd0614840e895cdfddc293adc3656219
parent: 16c0bbd6339aadad8b994867ba05a44a0b326a25 (diff)
download: fuzzycat-56e98d87439fd537cbc118bf30d10e66a8a9eced.tar.gz fuzzycat-56e98d87439fd537cbc118bf30d10e66a8a9eced.zip
1 files changed, 0 insertions, 102 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
deleted file mode 100644
index 49a061f..0000000
--- a/fuzzycat/build.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-WIP: Build auxiliary data structures for lookup.
-"""
-
-import fileinput
-import json
-import operator
-import sqlite3
-import string
-import sys
-import tempfile
-
-from nltk import word_tokenize
-from nltk.corpus import stopwords
-
-__all__ = [
-    "sqlite3db",
-    "NgramLookup",
-]
-
-
-class sqlitedb():
-    """
-    Simple cursor context manager for sqlite3 databases. Commits everything at exit.
-
-        with sqlitedb('/tmp/test.db') as cursor:
-            query = cursor.execute('SELECT * FROM items')
-            result = query.fetchall()
-    """
-    def __init__(self, path, timeout=5.0, detect_types=0):
-        self.path = path
-        self.conn = None
-        self.cursor = None
-        self.timeout = timeout
-        self.detect_types = detect_types
-
-    def __enter__(self):
-        self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
-        self.conn.text_factory = str
-        self.cursor = self.conn.cursor()
-        return self.cursor
-
-    def __exit__(self, exc_class, exc, traceback):
-        self.conn.commit()
-        self.conn.close()
-
-
-class TitleTokenList:
-    """
-    Build title token list.
-    """
-    def __init__(self, files="-", output=sys.stdout):
-        self.files = files
-        self.output = output
-        self.stopwords = stopwords.words('english') + list(
-            string.punctuation) + ["'", '"', "''", "`", "``"]
-
-    def run(self):
-        for i, line in enumerate(fileinput.input(files=self.files)):
-            if i % 1000000 == 0:
-                print("@{}".format(i), file=sys.stderr)
-            try:
-                doc = json.loads(line)
-                title = doc["title"]
-                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
-                self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
-            except KeyError:
-                print("skipping doc w/o title: {}".format(line), file=sys.stderr)
-
-
-class NgramLookup:
-    """
-    Outline:
-
-    * tokenize title
-    * remove stopwords
-    * take first N, last N
-    * tokenize first author
-
-    Build aux sqlite3 db.
-
-    Need to write out all data, the sort, the finalize as db.
-    """
-    def __init__(self, files="-", output="data.db", n=3):
-        self.files = files
-        self.output = output
-        self.stopwords = stopwords.words('english') + list(
-            string.punctuation) + ["'", '"', "''", "``", "'s", "→"]
-        self.n = n
-
-    def run(self):
-        fast_fields = operator.itemgetter("ident", "title")
-        for line in fileinput.input(files=self.files):
-            try:
-                doc = json.loads(line)
-                id, title = fast_fields(doc)
-                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
-                prefix = "-".join(tokens[:self.n])
-                suffix = "-".join(tokens[-self.n:])
-                print("{}\t{}-{}".format(id, prefix, suffix))
-            except KeyError as exc:
-                print("skipping doc w/o title: {} - {}".format(line, exc), file=sys.stderr)
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-25 01:30:40 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-25 01:30:40 +0100
commit	56e98d87439fd537cbc118bf30d10e66a8a9eced (patch)
tree	2ee69811fd0614840e895cdfddc293adc3656219
parent	16c0bbd6339aadad8b994867ba05a44a0b326a25 (diff)
download	fuzzycat-56e98d87439fd537cbc118bf30d10e66a8a9eced.tar.gz fuzzycat-56e98d87439fd537cbc118bf30d10e66a8a9eced.zip