diff options
-rw-r--r-- | notes/approach.dot | 13 | ||||
-rw-r--r-- | notes/approach.png | bin | 40516 -> 0 bytes | |||
-rw-r--r-- | notes/matching_metrics.md | 16 | ||||
-rw-r--r-- | tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my | 33 | ||||
-rw-r--r-- | tests/data/release/c3alu7csn5hyre6fuykhgby7za | 33 | ||||
-rw-r--r-- | tests/data/release/em7vs7m56fhzfh2onmqpihvmdy | 34 | ||||
-rw-r--r-- | tests/data/release/prxzzz63kventfyviovrgyw7bq | 33 | ||||
-rw-r--r-- | tests/data/release/vpnnootnwjc35ovylfvyzjq3fq | 33 | ||||
-rw-r--r-- | tests/data/verify.csv | 10 |
9 files changed, 192 insertions, 13 deletions
diff --git a/notes/approach.dot b/notes/approach.dot deleted file mode 100644 index 0bf3cbb..0000000 --- a/notes/approach.dot +++ /dev/null @@ -1,13 +0,0 @@ -digraph f { - "matching" -> "strings"; - "matching" -> "entities"; - - "strings" -> "lookups"; - "strings" -> "normalization"; - "strings" -> "fuzzy"; - - "entities" -> "identifiers"; - "entities" -> "field subsets"; - - "field subsets" -> "strings"; -} diff --git a/notes/approach.png b/notes/approach.png Binary files differdeleted file mode 100644 index cce18d7..0000000 --- a/notes/approach.png +++ /dev/null diff --git a/notes/matching_metrics.md b/notes/matching_metrics.md new file mode 100644 index 0000000..d37240f --- /dev/null +++ b/notes/matching_metrics.md @@ -0,0 +1,16 @@ +# Matching Metrics + +## Precision/Recall + +For fuzzy matching we want to understand precision and recall. Options for test datasets: + +* manually curated (100s of examples); could determine +* autogenerate slightly different set of real-world metadata (e.g. crossref vs. doaj) converted to releases +* automatically distorted set of records; 1 original, plus N distorted (synthetic) + +## Overall numbers + +* number of clusters per clustering method: "title", "lowercase", "nysiis", + "sandcrawler", a few more - contrastive comparison of these cluster, e.g. how +many more matches/non-matches we get for the various methods +* take N docs from non-clusters and run verify; we would want 100% different/ambiguous results diff --git a/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my b/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my new file mode 100644 index 0000000..661e6d4 --- /dev/null +++ b/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my @@ -0,0 +1,33 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Per Brinch Hansen", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1007/978-1-4757-3472-0_11" + }, + "extra": { + "container_name": "The Origin of Concurrent Programming", + "crossref": { + "type": "book-chapter" + } + }, + "ident": "a5w6vw2pojcbpdqwr4pbljs4my", + "pages": "297-318", + "publisher": "Springer New York", + "refs": [], + "release_stage": "published", + "release_type": "chapter", + "release_year": 1975, + "revision": "390de494-9a92-48da-a455-a55894fb8542", + "state": "active", + "title": "The Programming Language Concurrent Pascal", + "work_id": "bjuhfso3prgopfpbvaxpv3rnky" +} diff --git a/tests/data/release/c3alu7csn5hyre6fuykhgby7za b/tests/data/release/c3alu7csn5hyre6fuykhgby7za new file mode 100644 index 0000000..5844801 --- /dev/null +++ b/tests/data/release/c3alu7csn5hyre6fuykhgby7za @@ -0,0 +1,33 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Per Brinch Hansen", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1007/978-1-4612-6315-9_19" + }, + "extra": { + "container_name": "Programming Methodology", + "crossref": { + "type": "book-chapter" + } + }, + "ident": "c3alu7csn5hyre6fuykhgby7za", + "pages": "244-261", + "publisher": "Springer New York", + "refs": [], + "release_stage": "published", + "release_type": "chapter", + "release_year": 1978, + "revision": "bd41b695-83b5-4578-aa0f-9fb64127a244", + "state": "active", + "title": "The Programming Language Concurrent Pascal", + "work_id": "iji5kr4eajdn3fcln5eofhi3zm" +} diff --git a/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy b/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy new file mode 100644 index 0000000..183dcfd --- /dev/null +++ b/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy @@ -0,0 +1,34 @@ +{ + "abstracts": [], + "container_id": "dthyqga2onff5nwf4agwqtrhxi", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Per Brinch Hansen", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1109/tse.1975.6312840" + }, + "extra": { + "crossref": { + "type": "journal-article" + } + }, + "ident": "em7vs7m56fhzfh2onmqpihvmdy", + "pages": "199-207", + "publisher": "Institute of Electrical and Electronics Engineers (IEEE)", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1975, + "revision": "cb529109-965d-436e-95db-c0d3aa11efc2", + "state": "active", + "title": "The programming language Concurrent Pascal", + "volume": "SE-1", + "work_id": "agze5ddqinhgzcs6pmigf6xk2i" +} diff --git a/tests/data/release/prxzzz63kventfyviovrgyw7bq b/tests/data/release/prxzzz63kventfyviovrgyw7bq new file mode 100644 index 0000000..f868e0a --- /dev/null +++ b/tests/data/release/prxzzz63kventfyviovrgyw7bq @@ -0,0 +1,33 @@ +{ + "abstracts": [], + "container_id": "2w3awgokqne6te4nvlofavy5a4", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Per Brinch Hansen", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1007/3-540-07994-7_50" + }, + "extra": { + "crossref": { + "type": "book-chapter" + } + }, + "ident": "prxzzz63kventfyviovrgyw7bq", + "pages": "82-110", + "publisher": "Springer Berlin Heidelberg", + "refs": [], + "release_stage": "published", + "release_type": "chapter", + "release_year": 1976, + "revision": "e1220df2-c0bd-4748-a26b-48af38803349", + "state": "active", + "title": "The programming language concurrent pascal", + "work_id": "qmbmdbqt7jgdlphvkxfm6fuup4" +} diff --git a/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq b/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq new file mode 100644 index 0000000..2ff2735 --- /dev/null +++ b/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq @@ -0,0 +1,33 @@ +{ + "abstracts": [], + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "P. Brinch-Hansen", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1007/978-3-662-09507-2_17" + }, + "extra": { + "container_name": "Programming Languages", + "crossref": { + "type": "book-chapter" + } + }, + "ident": "vpnnootnwjc35ovylfvyzjq3fq", + "pages": "264-272", + "publisher": "Springer Berlin Heidelberg", + "refs": [], + "release_stage": "published", + "release_type": "chapter", + "release_year": 1983, + "revision": "d090a9e6-d2c6-4df9-b25b-88d9e2c962be", + "state": "active", + "title": "The Programming Language Concurrent Pascal", + "work_id": "zpiapu3atzgg3ap235zlo7n3qi" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 03a7047..a830eb8 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -306,3 +306,13 @@ r6fguusjobgythv5wnujnqkw2q,v43g5g7xjbd3lhcsnctfa7f3ue,Status.STRONG,PMID_DOI_PAI kukfjxop6vdtlldynhm57sjpna,pgcuewadijepxllmccksczqwku,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY kukfjxop6vdtlldynhm57sjpna,zztbhjw4ljdadhuzraccgbqzl4,, pgcuewadijepxllmccksczqwku,zztbhjw4ljdadhuzraccgbqzl4,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY +a5w6vw2pojcbpdqwr4pbljs4my,c3alu7csn5hyre6fuykhgby7za,Status.DIFFERENT,BOOK_CHAPTER +a5w6vw2pojcbpdqwr4pbljs4my,em7vs7m56fhzfh2onmqpihvmdy,Status.EXACT,TITLE_AUTHOR_MATCH +a5w6vw2pojcbpdqwr4pbljs4my,prxzzz63kventfyviovrgyw7bq,, +a5w6vw2pojcbpdqwr4pbljs4my,vpnnootnwjc35ovylfvyzjq3fq,, +c3alu7csn5hyre6fuykhgby7za,em7vs7m56fhzfh2onmqpihvmdy,, +c3alu7csn5hyre6fuykhgby7za,prxzzz63kventfyviovrgyw7bq,, +c3alu7csn5hyre6fuykhgby7za,vpnnootnwjc35ovylfvyzjq3fq,, +em7vs7m56fhzfh2onmqpihvmdy,prxzzz63kventfyviovrgyw7bq,, +em7vs7m56fhzfh2onmqpihvmdy,vpnnootnwjc35ovylfvyzjq3fq,, +prxzzz63kventfyviovrgyw7bq,vpnnootnwjc35ovylfvyzjq3fq,, |