author    | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:33:56 +0100
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:33:56 +0100
commit    | 9366af90058d14b1ca046ad89987ee8bade3c003 (patch)
tree      | 396fdbdfd5c468834aafa0ffecd48fec23e22e36
parent    | bafb146d7872be4719aa3c4ab5dba45e571eae1a (diff)
download  | fuzzycat-9366af90058d14b1ca046ad89987ee8bade3c003.tar.gz, fuzzycat-9366af90058d14b1ca046ad89987ee8bade3c003.zip
wip: aux lists and dbs
-rw-r--r-- | fuzzycat/build.py   | 119
-rw-r--r-- | fuzzycat/cluster.py |  17
-rw-r--r-- | fuzzycat/main.py    |  24
-rw-r--r-- | sql/sslookup.sql    |  13

4 files changed, 164 insertions, 9 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
new file mode 100644
index 0000000..e20ca32
--- /dev/null
+++ b/fuzzycat/build.py
@@ -0,0 +1,119 @@
+"""
+Build auxiliary data structures.
+"""
+
+import fileinput
+import sqlite3
+import string
+import sys
+import tempfile
+
+import orjson as json
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+
+__all__ = [
+    "sqlitedb",
+    "NgramLookup",
+]
+
+
+class sqlitedb():
+    """
+    Simple cursor context manager for sqlite3 databases. Commits everything at exit.
+
+        with sqlitedb('/tmp/test.db') as cursor:
+            query = cursor.execute('SELECT * FROM items')
+            result = query.fetchall()
+    """
+    def __init__(self, path, timeout=5.0, detect_types=0):
+        self.path = path
+        self.conn = None
+        self.cursor = None
+        self.timeout = timeout
+        self.detect_types = detect_types
+
+    def __enter__(self):
+        self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
+        self.conn.text_factory = str
+        self.cursor = self.conn.cursor()
+        return self.cursor
+
+    def __exit__(self, exc_class, exc, traceback):
+        self.conn.commit()
+        self.conn.close()
+
+
+class TitleTokenList:
+    """
+    Build title token list.
+    """
+    def __init__(self, files="-", output=sys.stdout):
+        self.files = files
+        self.output = output
+        self.stopwords = stopwords.words('english') + list(
+            string.punctuation) + ["'", '"', "''", "`", "``"]
+
+    def run(self):
+        for i, line in enumerate(fileinput.input(files=self.files)):
+            if i % 1000000 == 0:
+                print("@{}".format(i), file=sys.stderr)
+            try:
+                doc = json.loads(line)
+                title = doc["title"]
+                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
+                self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
+            except KeyError:
+                print("skipping doc w/o title: {}".format(line), file=sys.stderr)
+
+
+class NgramLookup:
+    """
+    Outline:
+
+    * tokenize title
+    * remove stopwords
+    * take first N, last N
+    * tokenize first author
+
+    Build aux sqlite3 db.
+
+    Need to write out all data, then sort, then finalize as db.
+    """
+    def __init__(self, files="-", output="data.db"):
+        self.files = files
+        self.output = output
+        self.stopwords = stopwords.words('english') + list(string.punctuation) + ["'", '"', "''"]
+
+    def run(self):
+        _, filename = tempfile.mkstemp()
+        with sqlitedb(filename) as cursor:
+            cursor.execute("""
+CREATE TABLE IF NOT EXISTS sslookup (
+    id INTEGER PRIMARY KEY,
+    title_prefix TEXT, title_suffix TEXT, contribs TEXT);
+            """)
+            cursor.execute("CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);")
+            cursor.execute("CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);")
+            cursor.execute("CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);")
+
+        print("temp db at {}".format(filename))
+        with sqlitedb(filename) as cursor:
+            batch = []
+            for i, line in enumerate(fileinput.input(files=self.files)):
+                if i % 10000 == 0:
+                    print("@{} inserting batch {}".format(i, len(batch)), file=sys.stderr)
+                    cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch)
+                    batch = []  # start a fresh batch after flushing
+                try:
+                    doc = json.loads(line)
+                    title = doc["title"]
+                    tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
+                    # self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
+                    prefix = "-".join(tokens[:3])
+                    suffix = "-".join(tokens[-3:])
+                    batch.append((prefix, suffix))
+                except KeyError:
+                    print("skipping doc w/o title: {}".format(line), file=sys.stderr)
+            if batch:  # flush the final partial batch
+                cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch)
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 755e94f..db20320 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -81,15 +81,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
-    "contribs": [
-    {
-    "index": 0,
-    "raw_name": "Meise Botanic Garden",
-    "role": "author"
-    }
-    ],
-
+    "contribs": [
+      {
+        "index": 0,
+        "raw_name": "Meise Botanic Garden",
+        "role": "author"
+      }
+    ],
+
+    Tokenize title, remove stopwords, lookup first three, lookup last three,
+    plus authors.
     """
     # SS: compare ngram sets?
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index d735a04..dfc0925 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -21,6 +21,7 @@ import tempfile
 
 import orjson as json
 
+from fuzzycat.build import NgramLookup, TitleTokenList
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
                               release_key_title_nysiis)
 
@@ -44,6 +45,17 @@ def run_verify(args):
     print('verify')
 
 
+def run_build(args):
+    if args.type == "ss":
+        builder = NgramLookup(files=args.files, output=args.output)
+        builder.run()
+    elif args.type == "tt":
+        builder = TitleTokenList(files=args.files, output=args.output)
+        builder.run()
+    else:
+        raise NotImplementedError()
+
+
 if __name__ == '__main__':
     logging.basicConfig(
         level=logging.DEBUG,
@@ -62,7 +74,7 @@ if __name__ == '__main__':
     sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
     sub_cluster.set_defaults(func=run_cluster)
-    sub_cluster.add_argument('-f', '--files', default="-", help='output files')
+    sub_cluster.add_argument('-f', '--files', default="-", help='input files')
     sub_cluster.add_argument('-t',
                              '--type',
                              default='title',
@@ -71,6 +83,16 @@ if __name__ == '__main__':
     sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
     sub_verify.set_defaults(func=run_verify)
 
+    sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
+    sub_build.add_argument('-f', '--files', default="-", help='input files')
+    sub_build.add_argument('-t', '--type', default="ss", help='type of database to build')
+    sub_build.add_argument('-o',
+                           '--output',
+                           type=argparse.FileType('w'),
+                           default=sys.stdout,
+                           help='output file')
+    sub_build.set_defaults(func=run_build)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print(__doc__, file=sys.stderr)
diff --git a/sql/sslookup.sql b/sql/sslookup.sql
new file mode 100644
index 0000000..768e40a
--- /dev/null
+++ b/sql/sslookup.sql
@@ -0,0 +1,13 @@
+
+-- Example: Low-energy nanotube chip says 'hello world'
+CREATE TABLE IF NOT EXISTS sslookup (
+    id INTEGER PRIMARY KEY,
+    title_prefix TEXT,
+    title_suffix TEXT,
+    contribs TEXT
+);
+
+CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);
+CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);
+CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);
+
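Once the `sslookup` table is populated, candidate matches for a document can be fetched by title prefix or suffix. A sketch of such a lookup using only the standard library (the database path and key values are illustrative assumptions; `NgramLookup` currently writes to a temporary file, and a table must already exist and be populated for the query to succeed):

```python
import sqlite3

# Path is an assumption for illustration; the WIP code writes to a tempfile.
conn = sqlite3.connect("data.db")
cursor = conn.cursor()

# Find rows sharing a title prefix or suffix with the query document; both
# predicates are covered by the single-column indexes from sql/sslookup.sql.
rows = cursor.execute(
    "SELECT id, title_prefix, title_suffix FROM sslookup "
    "WHERE title_prefix = ? OR title_suffix = ?",
    ("low-energy-nanotube", "says-hello-world"),
).fetchall()

for row in rows:
    print(row)

conn.close()
```

On the command line, the new subcommand wires the builders up, e.g. something like `python -m fuzzycat.main build -t ss -f releases.ndjson` (the invocation form is an assumption; the script guards on `__name__ == '__main__'`).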