aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/build.py116
-rw-r--r--fuzzycat/cluster.py17
-rw-r--r--fuzzycat/main.py24
-rw-r--r--sql/sslookup.sql13
4 files changed, 161 insertions, 9 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
new file mode 100644
index 0000000..e20ca32
--- /dev/null
+++ b/fuzzycat/build.py
@@ -0,0 +1,116 @@
+"""
+Build auxiliary data structures.
+"""
+
+import fileinput
+import sqlite3
+import string
+import sys
+import tempfile
+
+import orjson as json
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+
# Public names of this module. Every entry must match a definition in this
# file, otherwise ``from fuzzycat.build import *`` fails with AttributeError.
__all__ = [
    "sqlitedb",
    "NgramLookup",
    "TitleTokenList",
]
+
+
class sqlitedb():
    """
    Simple cursor context manager for sqlite3 databases. Commits everything at exit.

    The commit happens whenever the ``with`` block is left, including when it
    is left through an exception (the exception itself still propagates). The
    connection is always closed afterwards, even if the commit fails.

        with sqlitedb('/tmp/test.db') as cursor:
            query = cursor.execute('SELECT * FROM items')
            result = query.fetchall()
    """
    def __init__(self, path, timeout=5.0, detect_types=0):
        """
        Store connection settings; nothing is opened until ``__enter__``.

        path, timeout and detect_types are forwarded to ``sqlite3.connect``.
        """
        self.path = path
        self.conn = None
        self.cursor = None
        self.timeout = timeout
        self.detect_types = detect_types

    def __enter__(self):
        """Open the connection and return a fresh cursor on it."""
        self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
        self.conn.text_factory = str
        self.cursor = self.conn.cursor()
        return self.cursor

    def __exit__(self, exc_class, exc, traceback):
        """Commit pending changes, then close the connection."""
        try:
            self.conn.commit()
        finally:
            # Close even when commit() raises, so the handle is never leaked.
            self.conn.close()
+
+
class TitleTokenList:
    """
    Build title token list.

    Reads one JSON document per input line and writes, for each document that
    has a title, one line holding the JSON array of its lowercased title
    tokens with stopwords and punctuation tokens removed.
    """
    def __init__(self, files="-", output=sys.stdout):
        """
        files is handed to ``fileinput.input``; output is a text stream with a
        ``write`` method that receives one line per document.
        """
        self.files = files
        self.output = output
        self.stopwords = stopwords.words('english') + list(
            string.punctuation) + ["'", '"', "''", "`", "``"]

    def run(self):
        """
        Stream the input and write one token list per document.

        Documents without a "title" key, or with a null title, are reported on
        stderr and skipped.
        """
        # Set membership keeps the per-token stopword test O(1); built here so
        # changes made to self.stopwords before run() are still honoured.
        stopset = frozenset(self.stopwords)
        # The context manager closes the input files when the loop ends.
        with fileinput.input(files=self.files) as lines:
            for i, line in enumerate(lines):
                if i % 1000000 == 0:
                    print("@{}".format(i), file=sys.stderr)
                try:
                    title = json.loads(line)["title"]
                except KeyError:
                    title = None
                if title is None:
                    print("skipping doc w/o title: {}".format(line), file=sys.stderr)
                    continue
                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in stopset]
                # json is orjson here, whose dumps() returns bytes.
                self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
+
+
class NgramLookup:
    """
    Outline:

    * tokenize title
    * remove stopwords
    * take first N, last N
    * tokenize first author

    Build aux sqlite3 db.

    Need to write out all data, then sort, then finalize as db.
    """
    def __init__(self, files="-", output="data.db", batch_size=10000):
        """
        files is handed to ``fileinput.input``; batch_size is the number of
        rows collected before they are written with one ``executemany``.

        NOTE(review): output is stored but run() does not use it yet; the
        database currently stays at the temporary path that run() prints.
        """
        self.files = files
        self.output = output
        self.batch_size = batch_size
        self.stopwords = stopwords.words('english') + list(string.punctuation) + ["'", '"', "''"]

    def run(self):
        """
        Create a temporary sqlite3 database and fill the ``sslookup`` table.

        For every input document with a title, the first three and the last
        three non-stopword title tokens are each joined with "-" and stored as
        (title_prefix, title_suffix). Documents without a "title" key, or with
        a null title, are reported on stderr and skipped.
        """
        # NamedTemporaryFile closes its descriptor on exit; delete=False keeps
        # the (empty) file around so sqlite can open it by name.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            filename = tmp.name

        with sqlitedb(filename) as cursor:
            cursor.execute("""
CREATE TABLE IF NOT EXISTS sslookup (
    id INTEGER PRIMARY KEY,
    title_prefix TEXT, title_suffix TEXT, contribs TEXT);
            """)
            cursor.execute("CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);")
            cursor.execute("CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);")
            cursor.execute("CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);")

        print("temp db at {}".format(filename))

        stopset = frozenset(self.stopwords)
        with sqlitedb(filename) as cursor, fileinput.input(files=self.files) as lines:
            batch = []
            i = 0
            for i, line in enumerate(lines):
                try:
                    title = json.loads(line)["title"]
                except KeyError:
                    title = None
                if title is None:
                    print("skipping doc w/o title: {}".format(line), file=sys.stderr)
                    continue
                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in stopset]
                batch.append(("-".join(tokens[:3]), "-".join(tokens[-3:])))
                if len(batch) >= self.batch_size:
                    self._insert_batch(cursor, batch, i)
            # Rows left over after the last full batch must be written too.
            if batch:
                self._insert_batch(cursor, batch, i)

    @staticmethod
    def _insert_batch(cursor, batch, lineno):
        """Write the collected rows and empty the batch so no row is inserted twice."""
        print("@{} inserting batch {}".format(lineno, len(batch)), file=sys.stderr)
        cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch)
        del batch[:]
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 755e94f..db20320 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -81,15 +81,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
"""
Derive a key from title and authors. Authors in contribs list:
- "contribs": [
- {
- "index": 0,
- "raw_name": "Meise Botanic Garden",
- "role": "author"
- }
- ],
-
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Meise Botanic Garden",
+ "role": "author"
+ }
+ ],
+ Tokenize title, remove stopwords, lookup first three, lookup last three,
+ plus authors.
"""
# SS: compare ngram sets?
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index d735a04..dfc0925 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -21,6 +21,7 @@ import tempfile
import orjson as json
+from fuzzycat.build import NgramLookup, TitleTokenList
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis)
@@ -44,6 +45,17 @@ def run_verify(args):
print('verify')
def run_build(args):
    """
    Build the auxiliary data structure selected by ``args.type``.

    "ss" runs NgramLookup and "tt" runs TitleTokenList; both are constructed
    with ``args.files`` and ``args.output``.

    Raises NotImplementedError for any other type.
    """
    if args.type == "ss":
        builder = NgramLookup(files=args.files, output=args.output)
    elif args.type == "tt":
        builder = TitleTokenList(files=args.files, output=args.output)
    else:
        # Name the offending value so a mistyped --type is easy to spot.
        raise NotImplementedError("unsupported build type: {!r}".format(args.type))
    builder.run()
+
+
if __name__ == '__main__':
logging.basicConfig(
level=logging.DEBUG,
@@ -62,7 +74,7 @@ if __name__ == '__main__':
sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
sub_cluster.set_defaults(func=run_cluster)
- sub_cluster.add_argument('-f', '--files', default="-", help='output files')
+ sub_cluster.add_argument('-f', '--files', default="-", help='input files')
sub_cluster.add_argument('-t',
'--type',
default='title',
@@ -71,6 +83,16 @@ if __name__ == '__main__':
sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
sub_verify.set_defaults(func=run_verify)
+ sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
+ sub_build.add_argument('-f', '--files', default="-", help='input files')
+ sub_build.add_argument('-t', '--type', default="ss", help='type of database to build')
+ sub_build.add_argument('-o',
+ '--output',
+ type=argparse.FileType('w'),
+ default=sys.stdout,
+ help='output file')
+ sub_build.set_defaults(func=run_build)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print(__doc__, file=sys.stderr)
diff --git a/sql/sslookup.sql b/sql/sslookup.sql
new file mode 100644
index 0000000..768e40a
--- /dev/null
+++ b/sql/sslookup.sql
@@ -0,0 +1,13 @@
+
-- Schema for the sslookup table and its title indexes.
-- Every statement uses IF NOT EXISTS so the script can be re-run against an
-- existing database without failing.
-- Example: Low-energy nanotube chip says 'hello world'
CREATE TABLE IF NOT EXISTS sslookup (
    id INTEGER PRIMARY KEY,
    title_prefix TEXT,
    title_suffix TEXT,
    contribs TEXT
);

CREATE INDEX IF NOT EXISTS idx_sslookup_title ON sslookup (title_prefix, title_suffix);
CREATE INDEX IF NOT EXISTS idx_sslookup_title_prefix ON sslookup (title_prefix);
CREATE INDEX IF NOT EXISTS idx_sslookup_title_suffix ON sslookup (title_suffix);
+