aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/matching.py
diff options
context:
space:
mode:
Diffstat (limited to 'fuzzycat/matching.py')
-rw-r--r--fuzzycat/matching.py58
1 files changed, 52 insertions, 6 deletions
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b01ce64..38899f9 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -45,7 +45,7 @@ class FuzzyReleaseMatcher:
...) this is and will be too slow.
Anecdata: An early 2020 test run matching 23M "title strings" took
- literally a couple of weeks to complete.
+ literally weeks to complete.
This class is currently tested against the live fatcat search instance. A
usage example:
@@ -63,7 +63,8 @@ class FuzzyReleaseMatcher:
index="fatcat_release",
size=10,
min_token_length=3,
- release_year_padding=1):
+ release_year_padding=1,
+ skip_id_matching=False):
if isinstance(es, str):
self.es = elasticsearch.Elasticsearch([es])
else:
@@ -74,6 +75,7 @@ class FuzzyReleaseMatcher:
self.logger = logging.getLogger("fuzzy")
self.min_token_length = min_token_length
self.release_year_padding = 1
+ self.skip_id_matching = skip_id_matching
def _match_id(self, release: Optional[ReleaseEntity]) -> List[ReleaseEntity]:
"""
@@ -114,7 +116,10 @@ class FuzzyReleaseMatcher:
"""
Match in the presence of defined title and contrib fields.
"""
- contrib_tokens = [tok for c in release.contribs for tok in c.raw_name.split()]
+ contrib_tokens = [
+ tok for c in release.contribs for tok in c.raw_name.split()
+ if len(tok) > self.min_token_length
+ ]
contrib_queries = [{
"match": {
"contrib_names": {
@@ -124,6 +129,11 @@ class FuzzyReleaseMatcher:
} for token in contrib_tokens]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": [
{
"match": {
@@ -137,6 +147,9 @@ class FuzzyReleaseMatcher:
] + contrib_queries,
},
}
+ # TODO: could boost on various things, like "overall metadata quality"
+ # (eg, does indexed record have title+year+release_type+container), or
+ # on publication stage (assuming things getting cited are 'published')
if release.release_year is not None:
query["bool"]["must"].append({
"range": {
@@ -149,6 +162,7 @@ class FuzzyReleaseMatcher:
})
result = []
self.logger.info(query)
+ # TODO: can we use the container name
resp = self.es.search(index=self.index,
body={
"query": query,
@@ -171,6 +185,11 @@ class FuzzyReleaseMatcher:
"""
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": [
{
"match": {
@@ -225,6 +244,11 @@ class FuzzyReleaseMatcher:
} for token in contrib_tokens]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": contrib_queries,
},
}
@@ -270,6 +294,11 @@ class FuzzyReleaseMatcher:
]
query = {
"bool": {
+ "must_not": [{
+ "match": {
+ "release_type": "stub",
+ },
+ }],
"must": token_queries,
},
}
@@ -307,10 +336,11 @@ class FuzzyReleaseMatcher:
document.
"""
if not release:
- return []
- if release.ext_ids and len(release.ext_ids.to_dict()) > 0:
+ result = []
+ elif not self.skip_id_matching and release.ext_ids and any(
+ release.ext_ids.to_dict().values()):
result = self._match_id(release)
- if release.title is not None and release.contribs is not None:
+ elif release.title is not None and release.contribs is not None:
result = self._match_title_contrib(release)
elif release.title is not None:
result = self._match_title(release)
@@ -332,6 +362,22 @@ def public_api(host_uri):
conf.host = host_uri
return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
+def release_contrib_tokens(release: ReleaseEntity) -> List[str]:
+    """
+    Return the whitespace-separated name tokens of all contribs of a release.
+
+    raw_name is the primary source (as in the FuzzyReleaseMatcher contrib
+    queries); surname is the fallback when raw_name is missing. Contribs with
+    neither are skipped; a release without contribs yields an empty list.
+    """
+    tokens = []
+    for c in release.contribs or []:
+        # TODO: decide whether surname should be preferred over raw_name.
+        name = c.raw_name if c.raw_name is not None else c.surname
+        if name is not None:
+            tokens += name.split()
+    return tokens
+
def release_tokens(release: ReleaseEntity) -> List[str]:
"""