author    | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:33:56 +0100
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-07 00:33:56 +0100
commit    | 9366af90058d14b1ca046ad89987ee8bade3c003 (patch)
tree      | 396fdbdfd5c468834aafa0ffecd48fec23e22e36
parent    | bafb146d7872be4719aa3c4ab5dba45e571eae1a (diff)
download  | fuzzycat-9366af90058d14b1ca046ad89987ee8bade3c003.tar.gz, fuzzycat-9366af90058d14b1ca046ad89987ee8bade3c003.zip
wip: aux lists and dbs
-rw-r--r-- | fuzzycat/build.py   | 119
-rw-r--r-- | fuzzycat/cluster.py |  17
-rw-r--r-- | fuzzycat/main.py    |  24
-rw-r--r-- | sql/sslookup.sql    |  13

4 files changed, 164 insertions, 9 deletions
diff --git a/fuzzycat/build.py b/fuzzycat/build.py
new file mode 100644
index 0000000..e20ca32
--- /dev/null
+++ b/fuzzycat/build.py
@@ -0,0 +1,119 @@
+"""
+Build auxiliary data structures.
+"""
+
+import fileinput
+import sqlite3
+import string
+import sys
+import tempfile
+
+import orjson as json
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+
+__all__ = [
+    "sqlitedb",
+    "NgramLookup",
+]
+
+
+class sqlitedb():
+    """
+    Simple cursor context manager for sqlite3 databases. Commits everything at exit.
+
+        with sqlitedb('/tmp/test.db') as cursor:
+            query = cursor.execute('SELECT * FROM items')
+            result = query.fetchall()
+    """
+    def __init__(self, path, timeout=5.0, detect_types=0):
+        self.path = path
+        self.conn = None
+        self.cursor = None
+        self.timeout = timeout
+        self.detect_types = detect_types
+
+    def __enter__(self):
+        self.conn = sqlite3.connect(self.path, timeout=self.timeout, detect_types=self.detect_types)
+        self.conn.text_factory = str
+        self.cursor = self.conn.cursor()
+        return self.cursor
+
+    def __exit__(self, exc_class, exc, traceback):
+        self.conn.commit()
+        self.conn.close()
+
+
+class TitleTokenList:
+    """
+    Build title token list.
+    """
+    def __init__(self, files="-", output=sys.stdout):
+        self.files = files
+        self.output = output
+        self.stopwords = stopwords.words('english') + list(
+            string.punctuation) + ["'", '"', "''", "`", "``"]
+
+    def run(self):
+        for i, line in enumerate(fileinput.input(files=self.files)):
+            if i % 1000000 == 0:
+                print("@{}".format(i), file=sys.stderr)
+            try:
+                doc = json.loads(line)
+                title = doc["title"]
+                tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
+                self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
+            except KeyError:
+                print("skipping doc w/o title: {}".format(line), file=sys.stderr)
+
+
+class NgramLookup:
+    """
+    Outline:
+
+    * tokenize title
+    * remove stopwords
+    * take first N, last N
+    * tokenize first author
+
+    Build aux sqlite3 db.
+
+    Need to write out all data, then sort, then finalize as db.
+    """
+    def __init__(self, files="-", output="data.db"):
+        self.files = files
+        self.output = output
+        self.stopwords = stopwords.words('english') + list(string.punctuation) + ["'", '"', "''"]
+
+    def run(self):
+        _, filename = tempfile.mkstemp()
+        with sqlitedb(filename) as cursor:
+            cursor.execute("""
+CREATE TABLE IF NOT EXISTS sslookup (
+    id INTEGER PRIMARY KEY,
+    title_prefix TEXT, title_suffix TEXT, contribs TEXT);
+            """)
+            cursor.execute("CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);")
+            cursor.execute("CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);")
+            cursor.execute("CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);")
+
+        print("temp db at {}".format(filename))
+        with sqlitedb(filename) as cursor:
+            batch = []
+            for i, line in enumerate(fileinput.input(files=self.files)):
+                if i % 10000 == 0:
+                    print("@{} inserting batch {}".format(i, len(batch)), file=sys.stderr)
+                    cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch)
+                    batch = []  # start a fresh batch after flushing
+                try:
+                    doc = json.loads(line)
+                    title = doc["title"]
+                    tokens = [tok for tok in word_tokenize(title.lower()) if tok not in self.stopwords]
+                    # self.output.write(json.dumps(tokens).decode("utf-8") + "\n")
+                    prefix = "-".join(tokens[:3])
+                    suffix = "-".join(tokens[-3:])
+                    batch.append((prefix, suffix))
+                except KeyError:
+                    print("skipping doc w/o title: {}".format(line), file=sys.stderr)
+            if batch:  # flush the final partial batch
+                cursor.executemany("insert into sslookup(title_prefix, title_suffix) values(?, ?)", batch)
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 755e94f..db20320 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -81,15 +81,16 @@ def release_key_title_authors_ngram(doc: KeyDoc) -> Tuple[str, str]:
     """
     Derive a key from title and authors. Authors in contribs list:
-    "contribs": [
-    {
-    "index": 0,
-    "raw_name": "Meise Botanic Garden",
-    "role": "author"
-    }
-    ],
-
+    "contribs": [
+      {
+        "index": 0,
+        "raw_name": "Meise Botanic Garden",
+        "role": "author"
+      }
+    ],
+
+    Tokenize title, remove stopwords, lookup first three, lookup last three,
+    plus authors.
     """
     # SS: compare ngram sets?
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index d735a04..dfc0925 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -21,6 +21,7 @@ import tempfile
 
 import orjson as json
 
+from fuzzycat.build import NgramLookup, TitleTokenList
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
                               release_key_title_nysiis)
 
@@ -44,6 +45,17 @@ def run_verify(args):
     print('verify')
 
 
+def run_build(args):
+    if args.type == "ss":
+        builder = NgramLookup(files=args.files, output=args.output)
+        builder.run()
+    elif args.type == "tt":
+        builder = TitleTokenList(files=args.files, output=args.output)
+        builder.run()
+    else:
+        raise NotImplementedError()
+
+
 if __name__ == '__main__':
     logging.basicConfig(
         level=logging.DEBUG,
@@ -62,7 +74,7 @@ if __name__ == '__main__':
     sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
     sub_cluster.set_defaults(func=run_cluster)
-    sub_cluster.add_argument('-f', '--files', default="-", help='output files')
+    sub_cluster.add_argument('-f', '--files', default="-", help='input files')
     sub_cluster.add_argument('-t',
                              '--type',
                              default='title',
@@ -71,6 +83,16 @@ if __name__ == '__main__':
     sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
     sub_verify.set_defaults(func=run_verify)
 
+    sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
+    sub_build.add_argument('-f', '--files', default="-", help='input files')
+    sub_build.add_argument('-t', '--type', default="ss", help='type of database to build')
+    sub_build.add_argument('-o',
+                           '--output',
+                           type=argparse.FileType('w'),
+                           default=sys.stdout,
+                           help='output file')
+    sub_build.set_defaults(func=run_build)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print(__doc__, file=sys.stderr)
diff --git a/sql/sslookup.sql b/sql/sslookup.sql
new file mode 100644
index 0000000..768e40a
--- /dev/null
+++ b/sql/sslookup.sql
@@ -0,0 +1,13 @@
+
+-- Example: Low-energy nanotube chip says 'hello world'
+CREATE TABLE IF NOT EXISTS sslookup (
+    id INTEGER PRIMARY KEY,
+    title_prefix TEXT,
+    title_suffix TEXT,
+    contribs TEXT
+);
+
+CREATE INDEX idx_sslookup_title ON sslookup (title_prefix, title_suffix);
+CREATE INDEX idx_sslookup_title_prefix ON sslookup (title_prefix);
+CREATE INDEX idx_sslookup_title_suffix ON sslookup (title_suffix);
+
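Once the `sslookup` table is populated, candidate matches for a document can be fetched by title prefix or suffix. A sketch of such a lookup using only the standard library (the database path and key values are illustrative assumptions; `NgramLookup` currently writes to a temporary file, and a table must already exist and be populated for the query to succeed):

```python
import sqlite3

# Path is an assumption for illustration; the WIP code writes to a tempfile.
conn = sqlite3.connect("data.db")
cursor = conn.cursor()

# Find rows sharing a title prefix or suffix with the query document; both
# predicates are covered by the single-column indexes from sql/sslookup.sql.
rows = cursor.execute(
    "SELECT id, title_prefix, title_suffix FROM sslookup "
    "WHERE title_prefix = ? OR title_suffix = ?",
    ("low-energy-nanotube", "says-hello-world"),
).fetchall()

for row in rows:
    print(row)

conn.close()
```

On the command line, the new subcommand wires the builders up, e.g. something like `python -m fuzzycat.main build -t ss -f releases.ndjson` (the invocation form is an assumption; the script guards on `__name__ == '__main__'`).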