aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-12-17 10:07:15 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-12-21 20:56:56 +0100
commit: de9f1155ea57c812171abd5517ab39f4fe135cb3 (patch)
tree: 2b2071642259c46ede5b56d15cbce15187226362
parent: 4720fb51584fae1edc2a79dd94c24b4ddac92acb (diff)
download: fuzzycat-master.tar.gz
download: fuzzycat-master.zip
apply first round of feedback on matching (HEAD, master)
-rw-r--r-- fuzzycat/matching.py 58
-rw-r--r-- fuzzycat/utils.py 7
-rw-r--r-- tests/files/fuzzy_release_matcher/0.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/1.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/2.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/3.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/4.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/5.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/6.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/7.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/8.yaml 1
-rw-r--r-- tests/files/fuzzy_release_matcher/9.yaml 1
-rw-r--r-- tests/test_matching.py 7
13 files changed, 73 insertions, 9 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b01ce64..38899f9 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -45,7 +45,7 @@ class FuzzyReleaseMatcher:
...) this is and will be too slow.
Anecdata: An early 2020 test run matching 23M "title strings" took
- literally a couple of weeks to complete.
+ literally weeks to complete.
This class is currently tested against the live fatcat search instance. A
usage example:
@@ -63,7 +63,8 @@ class FuzzyReleaseMatcher:
index="fatcat_release",
size=10,
min_token_length=3,
- release_year_padding=1):
+ release_year_padding=1,
+ skip_id_matching=False):
if isinstance(es, str):
self.es = elasticsearch.Elasticsearch([es])
else:
@@ -74,6 +75,7 @@ class FuzzyReleaseMatcher:
self.logger = logging.getLogger("fuzzy")
self.min_token_length = min_token_length
self.release_year_padding = 1
+ self.skip_id_matching = skip_id_matching
def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
@@ -114,7 +116,10 @@ class FuzzyReleaseMatcher:
"""
Match in the presence of defined title and contrib fields.
"""
- contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+ contrib_tokens = [
+ tok for c in release.contribs for tok in c.raw_name.split()
+ if len(tok) > self.min_token_length
+ ]
contrib_queries = [{
"match": {
"contrib_names": {
@@ -124,6 +129,11 @@ class FuzzyReleaseMatcher:
} for token in contrib_tokens]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": [
{
"match": {
@@ -137,6 +147,9 @@ class FuzzyReleaseMatcher:
] + contrib_queries,
},
}
+ # TODO: could boost on various things, like "overall metadata quality"
+ # (eg, does indexed record have title+year+release_type+container), or
+ # on publication stage (assuming things getting cited are 'published')
if release.release_year is not None:
query["bool"]["must"].append({
"range": {
@@ -149,6 +162,7 @@ class FuzzyReleaseMatcher:
})
result = []
self.logger.info(query)
+ # TODO: can we use the container name
resp = self.es.search(index=self.index,
body={
"query": query,
@@ -171,6 +185,11 @@ class FuzzyReleaseMatcher:
"""
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": [
{
"match": {
@@ -225,6 +244,11 @@ class FuzzyReleaseMatcher:
} for token in contrib_tokens]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": contrib_queries,
},
}
@@ -270,6 +294,11 @@ class FuzzyReleaseMatcher:
]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": token_queries,
},
}
@@ -307,10 +336,11 @@ class FuzzyReleaseMatcher:
document.
"""
if not release:
- return []
- if release.ext_ids and len(release.ext_ids.to_dict()) > 0:
+ result = []
+ elif not self.skip_id_matching and release.ext_ids and any(
+ release.ext_ids.to_dict().values()):
result = self._match_id(release)
- if release.title is not None and release.contribs is not None:
+ elif release.title is not None and release.contribs is not None:
result = self._match_title_contrib(release)
elif release.title is not None:
result = self._match_title(release)
@@ -332,6 +362,22 @@ def public_api(host_uri):
conf.host = host_uri
return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+def release_contrib_tokens(release : ReleaseEntity) -> List[str]:
+ """
+ Return contribs as a list of tokens.
+ """
+ # TODO! fix this
+ tokens = []
+ for c in release.contribs:
+ if c.surname is not None:
+ tokens += c.surname.split()
+ elif c.raw_name is not None:
+ tokens += c.surname.split()
+ contrib_tokens = [
+ tok for c in release.contribs for tok in c.raw_name.split()
+ ]
+ return contrib_tokens
+
def release_tokens(release: ReleaseEntity) -> List[str]:
"""
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 24e103a..dadfa5c 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -30,6 +30,13 @@ def es_compat_hits_total(resp):
"""
try:
return resp["hits"]["total"]["value"] # ES7
+ except KeyError:
+ # with track_total_hits set to False, we observed missing "total" keys,
+ # es returns: {'_shards': {'failed': 0, 'skipped': 0, 'successful': 6,
+ # 'total': 6}, 'hits': {'hits': [{'_id': 'yvqtz2zvkzcbpj4jxrp7b...ons':
+ # [], 'any_abstract': False, 'ark_id': None, ...}, ...}],
+ # 'max_score': 108.32384}, 'timed_out': False, 'took': 921}
+ return len(resp["hits"]["hits"])
except TypeError:
return resp["hits"]["total"] # ES6
diff --git a/tests/files/fuzzy_release_matcher/0.yaml b/tests/files/fuzzy_release_matcher/0.yaml
index 71fc992..3c0b915 100644
--- a/tests/files/fuzzy_release_matcher/0.yaml
+++ b/tests/files/fuzzy_release_matcher/0.yaml
@@ -10,6 +10,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 1
+skip_id_matching: false
expected:
- 7rmvqtrb2jdyhcxxodihzzcugy
- upm5nljirrbsfenoyxsisciltq
diff --git a/tests/files/fuzzy_release_matcher/1.yaml b/tests/files/fuzzy_release_matcher/1.yaml
index df6a954..115111b 100644
--- a/tests/files/fuzzy_release_matcher/1.yaml
+++ b/tests/files/fuzzy_release_matcher/1.yaml
@@ -10,6 +10,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 1
+skip_id_matching: false
expected:
- 7rmvqtrb2jdyhcxxodihzzcugy
- a2u6ougtsjcbvczou6sazsulcm
diff --git a/tests/files/fuzzy_release_matcher/2.yaml b/tests/files/fuzzy_release_matcher/2.yaml
index df6a954..115111b 100644
--- a/tests/files/fuzzy_release_matcher/2.yaml
+++ b/tests/files/fuzzy_release_matcher/2.yaml
@@ -10,6 +10,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 1
+skip_id_matching: false
expected:
- 7rmvqtrb2jdyhcxxodihzzcugy
- a2u6ougtsjcbvczou6sazsulcm
diff --git a/tests/files/fuzzy_release_matcher/3.yaml b/tests/files/fuzzy_release_matcher/3.yaml
index 1ab761b..ed56d5a 100644
--- a/tests/files/fuzzy_release_matcher/3.yaml
+++ b/tests/files/fuzzy_release_matcher/3.yaml
@@ -5,6 +5,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 0
+skip_id_matching: false
expected:
- '2f57funqizf4lcxjanls45upom'
- '3p2hngx6kfa33bdaobipimdzhe'
diff --git a/tests/files/fuzzy_release_matcher/4.yaml b/tests/files/fuzzy_release_matcher/4.yaml
index 9419406..899772b 100644
--- a/tests/files/fuzzy_release_matcher/4.yaml
+++ b/tests/files/fuzzy_release_matcher/4.yaml
@@ -6,6 +6,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 0
+skip_id_matching: false
expected:
- '66r4s55dpvht5jghwkhupai2km'
- 'ccoocm7uzjgwnlpfk5fbwfudjm'
diff --git a/tests/files/fuzzy_release_matcher/5.yaml b/tests/files/fuzzy_release_matcher/5.yaml
index 1eb435b..d8f208a 100644
--- a/tests/files/fuzzy_release_matcher/5.yaml
+++ b/tests/files/fuzzy_release_matcher/5.yaml
@@ -10,6 +10,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 1
+skip_id_matching: false
expected:
- 'xfhjsixnlvbibigrilisqqvfk4'
- 'zfhfpo2shrdexpgd2as4fz7wnm'
diff --git a/tests/files/fuzzy_release_matcher/6.yaml b/tests/files/fuzzy_release_matcher/6.yaml
index ae52b23..7841b68 100644
--- a/tests/files/fuzzy_release_matcher/6.yaml
+++ b/tests/files/fuzzy_release_matcher/6.yaml
@@ -10,6 +10,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 0
+skip_id_matching: false
expected:
- 2bbtr4cltbgannqc6vqijvvzdq
- 34i2hba6tzf3xomobhumfkkvga
diff --git a/tests/files/fuzzy_release_matcher/7.yaml b/tests/files/fuzzy_release_matcher/7.yaml
index 2330f0d..7affb8f 100644
--- a/tests/files/fuzzy_release_matcher/7.yaml
+++ b/tests/files/fuzzy_release_matcher/7.yaml
@@ -5,6 +5,7 @@ input: >
"ext_ids": {}
}
release_year_padding: 1
+skip_id_matching: false
expected:
- yvqtz2zvkzcbpj4jxrp7bvydfu
- lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/fuzzy_release_matcher/8.yaml b/tests/files/fuzzy_release_matcher/8.yaml
index b43e53a..271d1a4 100644
--- a/tests/files/fuzzy_release_matcher/8.yaml
+++ b/tests/files/fuzzy_release_matcher/8.yaml
@@ -134,6 +134,7 @@ input: >
}
}
release_year_padding: 1
+skip_id_matching: true
expected:
- yvqtz2zvkzcbpj4jxrp7bvydfu
- lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/fuzzy_release_matcher/9.yaml b/tests/files/fuzzy_release_matcher/9.yaml
index b43e53a..271d1a4 100644
--- a/tests/files/fuzzy_release_matcher/9.yaml
+++ b/tests/files/fuzzy_release_matcher/9.yaml
@@ -134,6 +134,7 @@ input: >
}
}
release_year_padding: 1
+skip_id_matching: true
expected:
- yvqtz2zvkzcbpj4jxrp7bvydfu
- lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/test_matching.py b/tests/test_matching.py
index eb54751..a8f8f5b 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -29,8 +29,7 @@ logger.setLevel(logging.DEBUG)
FATCAT_SEARCH_URL = settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443")
-def yaml_to_cases(klass,
- files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"):
+def yaml_to_cases(klass, files="tests/files/fuzzy_release_matcher/*.yaml"):
"""
Turn yaml files into a collection of named tuple test cases. The glob is
relative to the project root (i.e. where you usually run `pytest` from).
@@ -54,10 +53,12 @@ def test_simple_fuzzy_release_matcher(es_client, caplog):
the result to be sensible, but should also document broken examples here.
"""
matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "about input release_year_padding expected")
+ Case = collections.namedtuple(
+ "Case", ["about", "input", "skip_id_matching", "release_year_padding", "expected"])
cases = yaml_to_cases(Case, "tests/files/fuzzy_release_matcher/*.yaml")
for i, c in enumerate(cases):
matcher.release_year_padding = c.release_year_padding
+ matcher.skip_id_matching = c.skip_id_matching
entity = entity_from_json(c.input, ReleaseEntity)
result = matcher.match(entity)
assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)