diff options
-rw-r--r-- | fuzzycat/cluster.py | 54 | ||||
-rw-r--r-- | notes/bm.md | 19 |
2 files changed, 73 insertions, 0 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index ee23979..e4a36bf 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -1,6 +1,60 @@ # pylint: disable=C0103 """ Clustering stage. + +* [ ] verify needs whole document +* [ ] parallelization misses groups +* [ ] cached match key store (sqlite3), something ~/.cache/... +* [ ] reproducibly run test +* [ ] place to put md record tests + +---- + +* [ ] hadoop -> py (bn) +* [ ] gnu parallel, share command line -- note (bn) + +---- + +Ideas: + +* lookup potential matches; TSV [key, ...]; sort +* maybe new "schema" - size vs "common schema" -- key <TAB> {"bibjson": ...} +* merge-join + +``` +$ fuzzycat.main keygen -s "algo" < ours | sort -k1,1 > a.tsv +$ fuzzycat.main keygen -s "algo" < other | sort -k1,1 > b.tsv +$ merge-join a.tsv b.tsv +``` + +A couple of "keygen" algos. + +> 10k/s, 1B, ~day + +Partial fields should be ok. + +Q: + +* nysiis + +Deps. + +* pydantic; json "omitempty" -- get rid of it? +* orjson (serialize datetime) -- maybe enough w/ dataclasses + +fuzzycat.main -> `__main__.py` + +* elasticsearch-py >> elasticsearch + +Matching releases to non-release entities. + +---- + +Features and integration. + +* work grouping at import time; random pdfs; requires strong verification (vs cgraph) +* email out to OCI + """ import collections diff --git a/notes/bm.md b/notes/bm.md new file mode 100644 index 0000000..b6c3a7c --- /dev/null +++ b/notes/bm.md @@ -0,0 +1,19 @@ +# b/m + +## cluster, verify + +* git pull deploy, aitio +* cluster example +* test with + +## regatedl match results + +* https://git.archive.org/martin/regatedl, in fixtures: https://git.archive.org/martin/regatedl/-/tree/master/fixtures + +## the temp data structure + +* should go in ~/.cache/... +* sqlite; TSV + +## tigris ideas + |