From 4cdce8ce95df74c706394bbcd341283e4dee0525 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 15 Jun 2021 18:12:41 +0200 Subject: notes: duplicates --- python/notes/version_4.md | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'python/notes/version_4.md') diff --git a/python/notes/version_4.md b/python/notes/version_4.md index 10533fd..e504b2a 100644 --- a/python/notes/version_4.md +++ b/python/notes/version_4.md @@ -767,3 +767,57 @@ refs (2017 only) ``` + +## Duplicates in combined dataset + +When we merge matches with refs, we find duplicates, e.g.: + +``` +{ + "_id": "4kg2dejsgzaf3cszs2lt5hz4by_9", + "indexed_ts": "2021-06-15T15:30:42Z", + "source_release_ident": "4kg2dejsgzaf3cszs2lt5hz4by", + "source_work_ident": "2222jduvonfg3p2no5gvvf2sj4", + "source_year": "2011", + "ref_index": 9, + "ref_key": "ref9", + "target_release_ident": "itntzdjbczfmhhaynqvqcwp6wm", + "target_work_ident": "csff4o7yjzbz3mfszl4zvfkcua", + "match_provenance": "crossref", + "match_status": "exact", + "match_reason": "doi" +} +{ + "_id": "4kg2dejsgzaf3cszs2lt5hz4by_9", + "indexed_ts": "2021-06-15T15:30:42Z", + "source_release_ident": "4kg2dejsgzaf3cszs2lt5hz4by", + "source_work_ident": "2222jduvonfg3p2no5gvvf2sj4", + "source_year": "2011", + "ref_index": 9, + "ref_key": "b9", + "match_status": "unmatched", + "match_reason": "unknown", + "target_unstructured": "Danaceau JP, Deering CE, Day JE, Smeal SJ, Johnson-Davis KL, et al. (2007) Persistence of tolerance to methamphetamine-induced monoamine deficits. Eur J Pharmacol 559: 46-54." 
+} +{ + "_id": "4kg2dejsgzaf3cszs2lt5hz4by_9", + "indexed_ts": "2021-06-15T15:30:42Z", + "source_release_ident": "4kg2dejsgzaf3cszs2lt5hz4by", + "source_work_ident": "2222jduvonfg3p2no5gvvf2sj4", + "source_year": "2011", + "ref_index": 9, + "ref_key": "ref10", + "match_status": "unmatched", + "match_reason": "unknown" +} +``` + +Here, the ref index is 9, but ref keys are different, which might come from a +different grobid run. I feel we should not depend on a value that we have +little control over. + +As a mitigation, we'll run a final deduplication step; but that won't catch +all duplicates, e.g. when the indices are different, but the reference is +actually the same. + +Would need a "uniq" tool for the whole ref blob or something like that. -- cgit v1.2.3