aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--notes/approach.dot13
-rw-r--r--notes/approach.pngbin40516 -> 0 bytes
-rw-r--r--notes/matching_metrics.md16
-rw-r--r--tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my33
-rw-r--r--tests/data/release/c3alu7csn5hyre6fuykhgby7za33
-rw-r--r--tests/data/release/em7vs7m56fhzfh2onmqpihvmdy34
-rw-r--r--tests/data/release/prxzzz63kventfyviovrgyw7bq33
-rw-r--r--tests/data/release/vpnnootnwjc35ovylfvyzjq3fq33
-rw-r--r--tests/data/verify.csv10
9 files changed, 192 insertions, 13 deletions
diff --git a/notes/approach.dot b/notes/approach.dot
deleted file mode 100644
index 0bf3cbb..0000000
--- a/notes/approach.dot
+++ /dev/null
@@ -1,13 +0,0 @@
-digraph f {
- "matching" -> "strings";
- "matching" -> "entities";
-
- "strings" -> "lookups";
- "strings" -> "normalization";
- "strings" -> "fuzzy";
-
- "entities" -> "identifiers";
- "entities" -> "field subsets";
-
- "field subsets" -> "strings";
-}
diff --git a/notes/approach.png b/notes/approach.png
deleted file mode 100644
index cce18d7..0000000
--- a/notes/approach.png
+++ /dev/null
Binary files differ
diff --git a/notes/matching_metrics.md b/notes/matching_metrics.md
new file mode 100644
index 0000000..d37240f
--- /dev/null
+++ b/notes/matching_metrics.md
@@ -0,0 +1,16 @@
+# Matching Metrics
+
+## Precision/Recall
+
+For fuzzy matching we want to understand precision and recall. Options for test datasets:
+
+* manually curated (100s of examples); could determine precision and recall directly
+* autogenerated, slightly different sets of real-world metadata (e.g. Crossref vs. DOAJ) converted to releases
+* automatically distorted set of records; 1 original, plus N distorted (synthetic)
+
+## Overall numbers
+
+* number of clusters per clustering method: "title", "lowercase", "nysiis",
+ "sandcrawler", a few more - contrastive comparison of these clusters, e.g. how
+many more matches/non-matches we get for the various methods
+* take N docs from non-clusters and run verify; we would want 100% different/ambiguous results
diff --git a/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my b/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my
new file mode 100644
index 0000000..661e6d4
--- /dev/null
+++ b/tests/data/release/a5w6vw2pojcbpdqwr4pbljs4my
@@ -0,0 +1,33 @@
+{
+ "abstracts": [],
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Per Brinch Hansen",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1007/978-1-4757-3472-0_11"
+ },
+ "extra": {
+ "container_name": "The Origin of Concurrent Programming",
+ "crossref": {
+ "type": "book-chapter"
+ }
+ },
+ "ident": "a5w6vw2pojcbpdqwr4pbljs4my",
+ "pages": "297-318",
+ "publisher": "Springer New York",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "chapter",
+ "release_year": 1975,
+ "revision": "390de494-9a92-48da-a455-a55894fb8542",
+ "state": "active",
+ "title": "The Programming Language Concurrent Pascal",
+ "work_id": "bjuhfso3prgopfpbvaxpv3rnky"
+}
diff --git a/tests/data/release/c3alu7csn5hyre6fuykhgby7za b/tests/data/release/c3alu7csn5hyre6fuykhgby7za
new file mode 100644
index 0000000..5844801
--- /dev/null
+++ b/tests/data/release/c3alu7csn5hyre6fuykhgby7za
@@ -0,0 +1,33 @@
+{
+ "abstracts": [],
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Per Brinch Hansen",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1007/978-1-4612-6315-9_19"
+ },
+ "extra": {
+ "container_name": "Programming Methodology",
+ "crossref": {
+ "type": "book-chapter"
+ }
+ },
+ "ident": "c3alu7csn5hyre6fuykhgby7za",
+ "pages": "244-261",
+ "publisher": "Springer New York",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "chapter",
+ "release_year": 1978,
+ "revision": "bd41b695-83b5-4578-aa0f-9fb64127a244",
+ "state": "active",
+ "title": "The Programming Language Concurrent Pascal",
+ "work_id": "iji5kr4eajdn3fcln5eofhi3zm"
+}
diff --git a/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy b/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy
new file mode 100644
index 0000000..183dcfd
--- /dev/null
+++ b/tests/data/release/em7vs7m56fhzfh2onmqpihvmdy
@@ -0,0 +1,34 @@
+{
+ "abstracts": [],
+ "container_id": "dthyqga2onff5nwf4agwqtrhxi",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Per Brinch Hansen",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1109/tse.1975.6312840"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "em7vs7m56fhzfh2onmqpihvmdy",
+ "pages": "199-207",
+ "publisher": "Institute of Electrical and Electronics Engineers (IEEE)",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1975,
+ "revision": "cb529109-965d-436e-95db-c0d3aa11efc2",
+ "state": "active",
+ "title": "The programming language Concurrent Pascal",
+ "volume": "SE-1",
+ "work_id": "agze5ddqinhgzcs6pmigf6xk2i"
+}
diff --git a/tests/data/release/prxzzz63kventfyviovrgyw7bq b/tests/data/release/prxzzz63kventfyviovrgyw7bq
new file mode 100644
index 0000000..f868e0a
--- /dev/null
+++ b/tests/data/release/prxzzz63kventfyviovrgyw7bq
@@ -0,0 +1,33 @@
+{
+ "abstracts": [],
+ "container_id": "2w3awgokqne6te4nvlofavy5a4",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Per Brinch Hansen",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1007/3-540-07994-7_50"
+ },
+ "extra": {
+ "crossref": {
+ "type": "book-chapter"
+ }
+ },
+ "ident": "prxzzz63kventfyviovrgyw7bq",
+ "pages": "82-110",
+ "publisher": "Springer Berlin Heidelberg",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "chapter",
+ "release_year": 1976,
+ "revision": "e1220df2-c0bd-4748-a26b-48af38803349",
+ "state": "active",
+ "title": "The programming language concurrent pascal",
+ "work_id": "qmbmdbqt7jgdlphvkxfm6fuup4"
+}
diff --git a/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq b/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq
new file mode 100644
index 0000000..2ff2735
--- /dev/null
+++ b/tests/data/release/vpnnootnwjc35ovylfvyzjq3fq
@@ -0,0 +1,33 @@
+{
+ "abstracts": [],
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "P. Brinch-Hansen",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1007/978-3-662-09507-2_17"
+ },
+ "extra": {
+ "container_name": "Programming Languages",
+ "crossref": {
+ "type": "book-chapter"
+ }
+ },
+ "ident": "vpnnootnwjc35ovylfvyzjq3fq",
+ "pages": "264-272",
+ "publisher": "Springer Berlin Heidelberg",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "chapter",
+ "release_year": 1983,
+ "revision": "d090a9e6-d2c6-4df9-b25b-88d9e2c962be",
+ "state": "active",
+ "title": "The Programming Language Concurrent Pascal",
+ "work_id": "zpiapu3atzgg3ap235zlo7n3qi"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 03a7047..a830eb8 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -306,3 +306,13 @@ r6fguusjobgythv5wnujnqkw2q,v43g5g7xjbd3lhcsnctfa7f3ue,Status.STRONG,PMID_DOI_PAI
kukfjxop6vdtlldynhm57sjpna,pgcuewadijepxllmccksczqwku,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
kukfjxop6vdtlldynhm57sjpna,zztbhjw4ljdadhuzraccgbqzl4,,
pgcuewadijepxllmccksczqwku,zztbhjw4ljdadhuzraccgbqzl4,Status.DIFFERENT,CONTRIB_INTERSECTION_EMPTY
+a5w6vw2pojcbpdqwr4pbljs4my,c3alu7csn5hyre6fuykhgby7za,Status.DIFFERENT,BOOK_CHAPTER
+a5w6vw2pojcbpdqwr4pbljs4my,em7vs7m56fhzfh2onmqpihvmdy,Status.EXACT,TITLE_AUTHOR_MATCH
+a5w6vw2pojcbpdqwr4pbljs4my,prxzzz63kventfyviovrgyw7bq,,
+a5w6vw2pojcbpdqwr4pbljs4my,vpnnootnwjc35ovylfvyzjq3fq,,
+c3alu7csn5hyre6fuykhgby7za,em7vs7m56fhzfh2onmqpihvmdy,,
+c3alu7csn5hyre6fuykhgby7za,prxzzz63kventfyviovrgyw7bq,,
+c3alu7csn5hyre6fuykhgby7za,vpnnootnwjc35ovylfvyzjq3fq,,
+em7vs7m56fhzfh2onmqpihvmdy,prxzzz63kventfyviovrgyw7bq,,
+em7vs7m56fhzfh2onmqpihvmdy,vpnnootnwjc35ovylfvyzjq3fq,,
+prxzzz63kventfyviovrgyw7bq,vpnnootnwjc35ovylfvyzjq3fq,,