aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-16 19:56:01 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-16 20:16:09 -0800
commit: 6d5811693c36b9e73dedf0205c40f2aed63e2870 (patch)
tree: 717de06d66ac009205a91cdeb511d113d61eac85
parent: 38328c25674fee7781a8d8601e1d47de04186f19 (diff)
download: fatcat-6d5811693c36b9e73dedf0205c40f2aed63e2870.tar.gz
download: fatcat-6d5811693c36b9e73dedf0205c40f2aed63e2870.zip
add fuzzy match filtering to DOAJ importer
In this default configuration, any entity with a fuzzy match (even an "ambiguous" one) will be skipped at import time, to prevent creating duplicates. This is deliberately conservative: it errs on the side of not creating new/duplicate entities. In the future, as we gain more confidence in fuzzy matching/verification, we can start to ignore AMBIGUOUS matches, handle EXACT matches as the same release, and merge STRONG (and WEAK?) matches under the same work entity.
-rw-r--r-- python/fatcat_tools/importers/doaj_article.py 11
-rw-r--r-- python/tests/import_doaj.py 16
2 files changed, 23 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 03752484..191a65d8 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter):
return False
break
- # TODO: in the future could do fuzzy match here, eg using elasticsearch
+ if not existing and self.do_fuzzy_match:
+ fuzzy_result = self.match_existing_release_fuzzy(re)
+ # TODO: in the future, could assign work_id for clustering, or for
+ # "EXACT" match, set existing and allow (optional) update code path
+ # to run
+ if fuzzy_result is not None:
+ self.counts["exists-fuzzy"] += 1
+ return False
- # create entity
+ # if no fuzzy existing match, create entity
if not existing:
return True
diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py
index d69aebd7..17a23257 100644
--- a/python/tests/import_doaj.py
+++ b/python/tests/import_doaj.py
@@ -3,6 +3,7 @@ import json
import datetime
import pytest
+import elasticsearch
import fatcat_openapi_client
from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher
@@ -11,14 +12,22 @@ from fixtures import *
@pytest.fixture(scope="function")
-def doaj_importer(api):
+def doaj_importer(api, mocker):
+ es_client = elasticsearch.Elasticsearch("mockbackend")
+ mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request')
with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
- yield DoajArticleImporter(api, issn_file, bezerk_mode=True)
+ yield DoajArticleImporter(
+ api,
+ issn_file,
+ bezerk_mode=True,
+ es_client=es_client,
+ )
def test_doaj_importer(doaj_importer):
last_index = doaj_importer.api.get_changelog(limit=1)[0].index
with open("tests/files/example_doaj_articles.json", "r") as f:
doaj_importer.bezerk_mode = True
+ doaj_importer.do_fuzzy_match = False
counts = JsonLinePusher(doaj_importer, f).run()
assert counts["insert"] == 5
assert counts["exists"] == 0
@@ -60,6 +69,7 @@ def test_doaj_importer_existing_doi(doaj_importer):
doaj_importer.reset()
doaj_importer.bezerk_mode = False
doaj_importer.do_updates = False
+ doaj_importer.do_fuzzy_match = False
counts = JsonLinePusher(doaj_importer, f).run()
print(counts)
assert counts["insert"] == 4
@@ -72,6 +82,7 @@ def test_doaj_importer_existing_doi(doaj_importer):
doaj_importer.reset()
doaj_importer.bezerk_mode = False
doaj_importer.do_updates = True
+ doaj_importer.do_fuzzy_match = False
counts = JsonLinePusher(doaj_importer, f).run()
print(counts)
assert counts["insert"] == 0
@@ -84,6 +95,7 @@ def test_doaj_importer_existing_doi(doaj_importer):
doaj_importer.reset()
doaj_importer.bezerk_mode = False
doaj_importer.do_updates = True
+ doaj_importer.do_fuzzy_match = False
counts = JsonLinePusher(doaj_importer, f).run()
print(counts)
assert counts["insert"] == 0