-rw-r--r--  fuzzycat/build.py  102
1 file changed, 0 insertions(+), 102 deletions(-)
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
deleted file mode 100644
index 49a061f..0000000
--- a/fuzzycat/build.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-WIP: Build auxiliary data structures for lookup.
-"""
-
-import fileinput
-import json
-import operator
-import sqlite3
-import string
-import sys
-import tempfile
-
-from nltk import word_tokenize
-from nltk.corpus import stopwords
-
-__all__ = [
- "sqlite3db",
- "NgramLookup",
-]
-
-
-class sqlitedb:
- """
- Simple cursor context manager for sqlite3 databases. Commits everything at exit.
-
- with sqlitedb('/tmp/test.db') as cursor:
- query = cursor.execute('SELECT * FROM items')
- result = query.fetchall()
- """
- def __init__(self, path, timeout=5.0, detect_types=0):
- self.path = path
- self.conn = None
- self.cursor = None
- self.timeout = timeout
- self.detect_types = detect_types
-
- def __enter__(self):
- self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
- self.conn.text_factory = str
- self.cursor = self.conn.cursor()
- return self.cursor
-
- def __exit__(self, exc_class, exc, traceback):
- self.conn.commit()
- self.conn.close()
-
-
-class TitleTokenList:
- """
- Build title token list.
- """
- def __init__(self, files="-", output=sys.stdout):
- self.files = files
- self.output = output
- self.stopwords = stopwords.words('english') + list(
- string.punctuation) + ["'", '"', "''", "`", "``"]
-
- def run(self):
- for i, line in enumerate(fileinput.input(files=self.files)):
- if i % 1000000 == 0:
- print("@{}".format(i), file=sys.stderr)
- try:
- doc = json.loads(line)
- title = doc["title"]
- tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
-                self.output.write(json.dumps(tokens) + "\n")
- except KeyError:
- print("skipping doc w/o title: {}".format(line), file=sys.stderr)
-
-
-class NgramLookup:
- """
- Outline:
-
- * tokenize title
- * remove stopwords
- * take first N, last N
- * tokenize first author
-
- Build aux sqlite3 db.
-
-    Need to write out all data, then sort, then finalize as a db.
- """
- def __init__(self, files="-", output="data.db", n=3):
- self.files = files
- self.output = output
- self.stopwords = stopwords.words('english') + list(
- string.punctuation) + ["'", '"', "''", "``", "'s", "→"]
- self.n = n
-
- def run(self):
- fast_fields = operator.itemgetter("ident", "title")
- for line in fileinput.input(files=self.files):
- try:
- doc = json.loads(line)
-                ident, title = fast_fields(doc)
- tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
- prefix = "-".join(tokens[:self.n])
- suffix = "-".join(tokens[-self.n:])
- print("{}\t{}-{}".format(id, prefix, suffix))
- except KeyError as exc:
- print("skipping doc w/o title: {} - {}".format(line, exc), file=sys.stderr)