diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:42:56 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:42:56 +0100 | 
| commit | e5c8e80fa246899fe95008fe7b599b6efe0e686e (patch) | |
| tree | 23f28ac569a2d2c135af2eacfc3e5b55a4fa863d /fuzzycat | |
| parent | 9366af90058d14b1ca046ad89987ee8bade3c003 (diff) | |
| download | fuzzycat-e5c8e80fa246899fe95008fe7b599b6efe0e686e.tar.gz fuzzycat-e5c8e80fa246899fe95008fe7b599b6efe0e686e.zip  | |
we need to build a list first, then convert into db
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/build.py | 48 | 
1 file changed, 18 insertions, 30 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py index e20ca32..a37453d 100644 --- a/fuzzycat/build.py +++ b/fuzzycat/build.py @@ -3,6 +3,7 @@ Build auxiliary data structures.  """  import fileinput +import operator  import sqlite3  import string  import sys @@ -80,37 +81,24 @@ class NgramLookup:      Need to write out all data, the sort, the finalize as db.      """ -    def __init__(self, files="-", output="data.db"): +    def __init__(self, files="-", output="data.db", n=3):          self.files = files          self.output = output -        self.stopwords = stopwords.words('english') + list(string.punctuation) + ["'", '"', "''"] +        self.stopwords = stopwords.words('english') + list( +            string.punctuation) + ["'", '"', "''", "``", "'s", "→"] +        self.n = n      def run(self): -        _, filename = tempfile.mkstemp() -        with sqlitedb(filename) as cursor: -            cursor.execute(""" -CREATE TABLE IF NOT EXISTS sslookup ( -    id INTEGER PRIMARY KEY, -	title_prefix TEXT, title_suffix TEXT, contribs TEXT); -            """) -            cursor.execute("CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);") -            cursor.execute("CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);") -            cursor.execute("CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);") - -        print("temp db at {}".format(filename)) -        with sqlitedb(filename) as cursor: -            batch = [] -            for i, line in enumerate(fileinput.input(files=self.files)): -                if i % 10000 == 0: -                    print("@{} inserting batch {}".format(i, len(batch), file=sys.stderr)) -                    cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch) -                try: -                    doc = json.loads(line) -                    title = doc["title"] -                    tokens = [tok for tok in word_tokenize(title.lower()) if tok not 
in self.stopwords] -                    # self.output.write(json.dumps(tokens).decode("utf-8") + "\n") -                    prefix = "-".join(tokens[:3]) -                    suffix = "-".join(tokens[-3:]) -                    batch.append((prefix, suffix)) -                except KeyError: -                    print("skipping doc w/o title: {}".format(line), file=sys.stderr) +        fast_fields = operator.itemgetter("ident", "title") +        for i, line in enumerate(fileinput.input(files=self.files)): +            if i % 10000 == 0: +                print("@{}".format(i), file=sys.stderr) +            try: +                doc = json.loads(line) +                id, title = fast_fields(doc) +                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords] +                prefix = "-".join(tokens[:self.n]) +                suffix = "-".join(tokens[-self.n:]) +                print("{}\t{}\t{}".format(id, prefix, suffix)) +            except KeyError as exc: +                print("skipping doc w/o title: {} - {}".format(line, exc), file=sys.stderr)  | 
