From 56e98d87439fd537cbc118bf30d10e66a8a9eced Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 25 Nov 2020 01:30:40 +0100
Subject: drop build module for now

---
 fuzzycat/build.py | 102 ------------------------------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100644 fuzzycat/build.py

diff --git a/fuzzycat/build.py b/fuzzycat/build.py
deleted file mode 100644
index 49a061f..0000000
--- a/fuzzycat/build.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-WIP: Build auxiliary data structures for lookup.
-"""
-
-import fileinput
-import json
-import operator
-import sqlite3
-import string
-import sys
-import tempfile
-
-from nltk import word_tokenize
-from nltk.corpus import stopwords
-
-__all__ = [
-    "sqlite3db",
-    "NgramLookup",
-]
-
-
-class sqlitedb():
-    """
-    Simple cursor context manager for sqlite3 databases. Commits everything at exit.
-
-        with sqlitedb('/tmp/test.db') as cursor:
-            query = cursor.execute('SELECT * FROM items')
-            result = query.fetchall()
-    """
-    def __init__(self, path, timeout=5.0, detect_types=0):
-        self.path = path
-        self.conn = None
-        self.cursor = None
-        self.timeout = timeout
-        self.detect_types = detect_types
-
-    def __enter__(self):
-        self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
-        self.conn.text_factory = str
-        self.cursor = self.conn.cursor()
-        return self.cursor
-
-    def __exit__(self, exc_class, exc, traceback):
-        self.conn.commit()
-        self.conn.close()
-
-
-class TitleTokenList:
-    """
-    Build title token list.
-    """
-    def __init__(self, files="-", output=sys.stdout):
-        self.files = files
-        self.output = output
-        self.stopwords = stopwords.words('english') + list(
-            string.punctuation) + ["'", '"', "''", "`", "``"]
-
-    def run(self):
-        for i, line in enumerate(fileinput.input(files=self.files)):
-            if i % 1000000 == 0:
-                print("@{}".format(i), file=sys.stderr)
-            try:
-                doc = json.loads(line)
-                title = doc["title"]
-                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
-                self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
-            except KeyError:
-                print("skipping doc w/o title: {}".format(line), file=sys.stderr)
-
-
-class NgramLookup:
-    """
-    Outline:
-
-    * tokenize title
-    * remove stopwords
-    * take first N, last N
-    * tokenize first author
-
-    Build aux sqlite3 db.
-
-    Need to write out all data, the sort, the finalize as db.
-    """
-    def __init__(self, files="-", output="data.db", n=3):
-        self.files = files
-        self.output = output
-        self.stopwords = stopwords.words('english') + list(
-            string.punctuation) + ["'", '"', "''", "``", "'s", "→"]
-        self.n = n
-
-    def run(self):
-        fast_fields = operator.itemgetter("ident", "title")
-        for line in fileinput.input(files=self.files):
-            try:
-                doc = json.loads(line)
-                id, title = fast_fields(doc)
-                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
-                prefix = "-".join(tokens[:self.n])
-                suffix = "-".join(tokens[-self.n:])
-                print("{}\t{}-{}".format(id, prefix, suffix))
-            except KeyError as exc:
-                print("skipping doc w/o title: {} - {}".format(line, exc), file=sys.stderr)
-- 
cgit v1.2.3