diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:42:56 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:42:56 +0100 |
commit | e5c8e80fa246899fe95008fe7b599b6efe0e686e (patch) | |
tree | 23f28ac569a2d2c135af2eacfc3e5b55a4fa863d | |
parent | 9366af90058d14b1ca046ad89987ee8bade3c003 (diff) | |
download | fuzzycat-e5c8e80fa246899fe95008fe7b599b6efe0e686e.tar.gz fuzzycat-e5c8e80fa246899fe95008fe7b599b6efe0e686e.zip |
we need to build a list first, then convert it into a db
-rw-r--r-- | fuzzycat/build.py | 48 |
1 file changed, 18 insertions, 30 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py index e20ca32..a37453d 100644 --- a/fuzzycat/build.py +++ b/fuzzycat/build.py @@ -3,6 +3,7 @@ Build auxiliary data structures. """ import fileinput +import operator import sqlite3 import string import sys @@ -80,37 +81,24 @@ class NgramLookup: Need to write out all data, the sort, the finalize as db. """ - def __init__(self, files="-", output="data.db"): + def __init__(self, files="-", output="data.db", n=3): self.files = files self.output = output - self.stopwords = stopwords.words('english') + list(string.punctuation) + ["'", '"', "''"] + self.stopwords = stopwords.words('english') + list( + string.punctuation) + ["'", '"', "''", "``", "'s", "→"] + self.n = n def run(self): - _, filename = tempfile.mkstemp() - with sqlitedb(filename) as cursor: - cursor.execute(""" -CREATE TABLE IF NOT EXISTS sslookup ( - id INTEGER PRIMARY KEY, - title_prefix TEXT, title_suffix TEXT, contribs TEXT); - """) - cursor.execute("CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);") - cursor.execute("CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);") - cursor.execute("CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);") - - print("temp db at {}".format(filename)) - with sqlitedb(filename) as cursor: - batch = [] - for i, line in enumerate(fileinput.input(files=self.files)): - if i % 10000 == 0: - print("@{} inserting batch {}".format(i, len(batch), file=sys.stderr)) - cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch) - try: - doc = json.loads(line) - title = doc["title"] - tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords] - # self.output.write(json.dumps(tokens).decode("utf-8") + "\n") - prefix = "-".join(tokens[:3]) - suffix = "-".join(tokens[-3:]) - batch.append((prefix, suffix)) - except KeyError: - print("skipping doc w/o title: {}".format(line), file=sys.stderr) + fast_fields = operator.itemgetter("ident", "title") + for i, line in enumerate(fileinput.input(files=self.files)): + if i % 10000 == 0: + print("@{}".format(i), file=sys.stderr) + try: + doc = json.loads(line) + id, title = fast_fields(doc) + tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords] + prefix = "-".join(tokens[:self.n]) + suffix = "-".join(tokens[-self.n:]) + print("{}\t{}\t{}".format(id, prefix, suffix)) + except KeyError as exc: + print("skipping doc w/o title: {} - {}".format(line, exc), file=sys.stderr)