diff options
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/verify.py | 10 | ||||
-rw-r--r-- | notes/2020_11_testruns.md | 5 | ||||
-rw-r--r-- | tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy | 23 | ||||
-rw-r--r-- | tests/data/release/tcro5wr6brhqnf5wettyiauw34 | 20 | ||||
-rw-r--r-- | tests/data/verify.csv | 1 | ||||
-rw-r--r-- | tests/test_verify.py | 4 |
7 files changed, 62 insertions, 2 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index c50962b..2257e9d 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -17,6 +17,7 @@ class OK(str, Enum): Reason for assuming we have a match. """ ARXIV_VERSION = 'ok.arxiv_version' + CUSTOM_BSI_SUBDOC = 'ok.custom_bsi_subdoc' CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated' CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv' DATACITE_RELATED_ID = 'ok.datacite_related_id' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 6fa4100..4daed97 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -76,7 +76,7 @@ from glom import PathAccessError, glom from fuzzycat.common import OK, Miss, Status from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix, - num_project, slugify_string, jaccard) + jaccard, num_project, slugify_string) # The result of clustering are documents that have a key k and a list of values # (of the cluster) v. @@ -211,6 +211,14 @@ def compare(a, b): if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): if a_doi + "u" == b_doi or b_doi + "u" == a_doi: return (Status.STRONG, OK.CUSTOM_BSI_UNDATED) + # Reference to subdocument. + # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34 + # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy + if a_title == b_title and ((dict_key_exists(a, "extra.subtitle") + and not dict_key_exists(b, "extra.subtitle")) or + (dict_key_exists(b, "extra.subtitle") + and not dict_key_exists(a, "extra.subtitle"))): + return (Status.STRONG, OK.CUSTOM_BSI_SUBDOC) except PathAccessError: pass diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md index 10fa426..e801df3 100644 --- a/notes/2020_11_testruns.md +++ b/notes/2020_11_testruns.md @@ -179,7 +179,12 @@ Ambiguous. * [ ] https://fatcat.wiki/release/b3uhit7b4vhvliocdzwxr7peyy https://fatcat.wiki/release/zwru5ugcsfcyzeuqlygfw46vwq Status.AMBIGUOUS OK.DUMMY +A difficult prefix. + * [ ] https://fatcat.wiki/release/s7a4o5v5gfg4tbzna6poyg7nzy https://fatcat.wiki/release/tcro5wr6brhqnf5wettyiauw34 Status.AMBIGUOUS OK.DUMMY + +BSI, one is a subdocument of another. The subdocument has a subtitle. That's more is-part-of. + * [ ] https://fatcat.wiki/release/b3odcrpuwveqljszl7l4c4ah5e https://fatcat.wiki/release/j4drpogsbzfd7esim4um5me374 Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/4owywaqwwvhrthytdjbulmltue https://fatcat.wiki/release/cy2v4wpm7nfptexi6ybjylapee Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/5ba6m23tmvcwdcpq3a2rcof4du https://fatcat.wiki/release/wczxyrrmovf3te5ziep7bbdreu Status.AMBIGUOUS OK.DUMMY diff --git a/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy b/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy new file mode 100644 index 0000000..02169b3 --- /dev/null +++ b/tests/data/release/s7a4o5v5gfg4tbzna6poyg7nzy @@ -0,0 +1,23 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.3403/02279101u" + }, + "extra": { + "crossref": { + "type": "standard" + }, + "subtitle": [ + "Fixed pipette method" + ] + }, + "ident": "s7a4o5v5gfg4tbzna6poyg7nzy", + "publisher": "BSI British Standards", + "refs": [], + "release_type": "standard", + "revision": "9ae088bf-5480-4a5b-b68d-356096d28d16", + "state": "active", + "title": "Determination of particle size distribution by gravitational liquid sedimentation methods", + "work_id": "chjktubyxbdpviowq6p4agfh5y" +} diff --git a/tests/data/release/tcro5wr6brhqnf5wettyiauw34 b/tests/data/release/tcro5wr6brhqnf5wettyiauw34 new file mode 100644 index 0000000..2247102 --- /dev/null +++ b/tests/data/release/tcro5wr6brhqnf5wettyiauw34 @@ -0,0 +1,20 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.3403/bsiso13317" + }, + "extra": { + "crossref": { + "type": "standard" + } + }, + "ident": "tcro5wr6brhqnf5wettyiauw34", + "publisher": "BSI British Standards", + "refs": [], + "release_type": "standard", + "revision": "23fae426-a22e-4226-8400-f085d7eb0c56", + "state": "active", + "title": "Determination of particle size distribution by gravitational liquid sedimentation methods", + "work_id": "hh6o7mm3ivblng4t2dhuhzocwm" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 389762c..d3c8dbe 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -137,3 +137,4 @@ voruupqxhvggfex4zlczcmjxxu,jg72qhdvmncfdfxg5l47hw3uba,Status.AMBIGUOUS,OK.DUMMY 6ysfa7ncx5fldmvmwvjgpf2i6e,yp3rs3xb5ra2riyx5xayrlqfum,Status.EXACT,OK.WORK_ID arqtphat7fashokettncepu7xe,v6p7xct6kfgwtdbh57zfjqmuua,Status.AMBIGUOUS, zwru5ugcsfcyzeuqlygfw46vwq,b3uhit7b4vhvliocdzwxr7peyy,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_7916 +s7a4o5v5gfg4tbzna6poyg7nzy,tcro5wr6brhqnf5wettyiauw34,Status.STRONG,OK.CUSTOM_BSI_SUBDOC diff --git a/tests/test_verify.py b/tests/test_verify.py index 5a60326..e3e05ad 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -38,7 +38,9 @@ def test_compare(): try: a, b, expected_status, expected_reason = row except ValueError as exc: - pytest.fail("invalid test file, maybe too many (or few) commas in row {}? {}".format(i + 1, exc)) + pytest.fail( + "invalid test file, maybe too many (or few) commas in row {}? {}".format( + i + 1, exc)) status, reason = compare(load_release_ident(a), load_release_ident(b)) if not expected_status or expected_status.lower() == "todo": logger.warning( |