diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 19:56:01 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-16 20:16:09 -0800 |
commit | 6d5811693c36b9e73dedf0205c40f2aed63e2870 (patch) | |
tree | 717de06d66ac009205a91cdeb511d113d61eac85 | |
parent | 38328c25674fee7781a8d8601e1d47de04186f19 (diff) | |
download | fatcat-6d5811693c36b9e73dedf0205c40f2aed63e2870.tar.gz fatcat-6d5811693c36b9e73dedf0205c40f2aed63e2870.zip |
add fuzzy match filtering to DOAJ importer
In this default configuration, any entities with a fuzzy match (even
"ambiguous") will be skipped at import time, to prevent creating
duplicates. This errs on the side of caution: the importer prefers to
skip a record rather than risk creating a new/duplicate entity.
In the future, as we get more confidence in fuzzy match/verification, we
can start to ignore AMBIGUOUS, handle EXACT as same release, and merge
STRONG (and WEAK?) matches under the same work entity.
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 11 | ||||
-rw-r--r-- | python/tests/import_doaj.py | 16 |
2 files changed, 23 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 03752484..191a65d8 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -217,9 +217,16 @@ class DoajArticleImporter(EntityImporter): return False break - # TODO: in the future could do fuzzy match here, eg using elasticsearch + if not existing and self.do_fuzzy_match: + fuzzy_result = self.match_existing_release_fuzzy(re) + # TODO: in the future, could assign work_id for clustering, or for + # "EXACT" match, set existing and allow (optional) update code path + # to run + if fuzzy_result is not None: + self.counts["exists-fuzzy"] += 1 + return False - # create entity + # if no fuzzy existing match, create entity if not existing: return True diff --git a/python/tests/import_doaj.py b/python/tests/import_doaj.py index d69aebd7..17a23257 100644 --- a/python/tests/import_doaj.py +++ b/python/tests/import_doaj.py @@ -3,6 +3,7 @@ import json import datetime import pytest +import elasticsearch import fatcat_openapi_client from fatcat_tools.importers import DoajArticleImporter, JsonLinePusher @@ -11,14 +12,22 @@ from fixtures import * @pytest.fixture(scope="function") -def doaj_importer(api): +def doaj_importer(api, mocker): + es_client = elasticsearch.Elasticsearch("mockbackend") + mocker.patch('elasticsearch.connection.Urllib3HttpConnection.perform_request') with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: - yield DoajArticleImporter(api, issn_file, bezerk_mode=True) + yield DoajArticleImporter( + api, + issn_file, + bezerk_mode=True, + es_client=es_client, + ) def test_doaj_importer(doaj_importer): last_index = doaj_importer.api.get_changelog(limit=1)[0].index with open("tests/files/example_doaj_articles.json", "r") as f: doaj_importer.bezerk_mode = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() assert counts["insert"] == 5 assert 
counts["exists"] == 0 @@ -60,6 +69,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = False + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 4 @@ -72,6 +82,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 @@ -84,6 +95,7 @@ def test_doaj_importer_existing_doi(doaj_importer): doaj_importer.reset() doaj_importer.bezerk_mode = False doaj_importer.do_updates = True + doaj_importer.do_fuzzy_match = False counts = JsonLinePusher(doaj_importer, f).run() print(counts) assert counts["insert"] == 0 |