| author | Martin Czygan <martin@archive.org> | 2021-04-15 14:11:09 +0000 |
|---|---|---|
| committer | Martin Czygan <martin@archive.org> | 2021-04-15 14:11:09 +0000 |
| commit | b27c43071ab021e9595457999359009cfd7a1abb (patch) | |
| tree | e00199889528c00f777f5bbc908d0962760fb96f | /fuzzycat |
| parent | 8a17311c9516e63aeb31111647fdf21083bcf928 (diff) | |
| parent | d44a9e421edfec2cac16048b67e6809cae8cdd18 (diff) | |
| download | fuzzycat-b27c43071ab021e9595457999359009cfd7a1abb.tar.gz fuzzycat-b27c43071ab021e9595457999359009cfd7a1abb.zip | |
Merge branch 'bnewbold-upstreaming' into 'master'
refactoring/upstreaming fuzzycat "live" matching helpers
See merge request webgroup/fuzzycat!2
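The merged helpers give a one-call path from a raw citation string to a verified fatcat release match. Below is a minimal sketch of the intended usage, not taken from the diff itself; the Elasticsearch URL is a placeholder for a host serving the fatcat release index, and the citation string is just an example.

```python
# Sketch of the new "live" matching helpers added by this merge.
# NOTE: the ES URL below is a placeholder, not a guaranteed endpoint.
import elasticsearch

from fuzzycat.simple import closest_fuzzy_unstructured_match

es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

match = closest_fuzzy_unstructured_match(
    "Aho, A.V., Sethi, R., Ullman, J.D.: Compilers: Principles, "
    "Techniques, and Tools. Addison-Wesley, 1986.",
    es_client=es_client,
)
if match:
    # status/reason are fuzzycat.common enums; release is a ReleaseEntity
    print(match.status.name, match.reason.name, match.release.ident)
```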
Diffstat (limited to 'fuzzycat')

| mode | file | lines |
|---|---|---|
| -rw-r--r-- | fuzzycat/__main__.py | 39 |
| -rwxr-xr-x | fuzzycat/grobid2json.py | 213 |
| -rw-r--r-- | fuzzycat/grobid_unstructured.py | 126 |
| -rw-r--r-- | fuzzycat/simple.py | 274 |

4 files changed, 651 insertions(+), 1 deletion(-)
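The bulk of the addition is fuzzycat/simple.py, which also accepts already-structured metadata. As a sketch of that path (field names taken from the close_fuzzy_biblio_matches() docstring in the diff below; `es_client` configured as in the example above, and the bibliographic values purely illustrative):

```python
# Sketch: match against structured bibliographic fields instead of a raw string.
# Assumes `es_client` was created as in the previous example.
from fuzzycat.simple import close_fuzzy_biblio_matches

matches = close_fuzzy_biblio_matches(
    dict(
        title="The PageRank Citation Ranking: Bringing Order to the Web",
        first_author="Page",
        year=1999,
    ),
    es_client=es_client,
)
# strongest (EXACT) matches sort first; DIFFERENT/TODO are filtered out
for m in matches or []:
    print(m.status.name, m.reason.name, m.release.ident)
```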
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 9121bd8..43691e8 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -8,6 +8,7 @@ COMMANDS
     verify_single
     verify_ref
     release_match
+    unstructured
 
   Run, e.g. fuzzycat cluster --help for more options.
 
@@ -45,7 +46,7 @@ EXAMPLES
   Release match (non-bulk).
 
-      $ python -m fuzzycat release_match -q "hello world"
+      $ python -m fuzzycat release_match --value "hello world"
 
       TODO: Elasticsearch might not respond to POST queries (which is what the
       client library uses, see: https://git.io/JLssk).
@@ -63,6 +64,7 @@ import random
 import sys
 import tempfile
 
+import elasticsearch
 import requests
 from fatcat_openapi_client import ReleaseEntity
 
@@ -70,8 +72,10 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngra
                               release_key_title_normalized, release_key_title_nysiis,
                               release_key_title_sandcrawler)
 from fuzzycat.entities import entity_to_dict
+from fuzzycat.grobid_unstructured import grobid_parse_unstructured
 from fuzzycat.matching import anything_to_entity, match_release_fuzzy
 from fuzzycat.refs import RefsGroupVerifier
+from fuzzycat.simple import closest_fuzzy_release_match
 from fuzzycat.utils import random_idents_from_query, random_word
 from fuzzycat.verify import GroupVerifier, verify
 
@@ -200,6 +204,29 @@ def run_release_match(args):
             print(json.dumps(vs))
 
 
+def run_unstructured(args):
+    """
+    Given a raw citation string, parse it and find "closest" match.
+
+    Uses lower-level routines instead of simple.closest_fuzzy_unstructured_match(raw_citation)
+    """
+    es_client = elasticsearch.Elasticsearch(args.es_url)
+
+    print("## Sending to GROBID...", file=sys.stderr)
+    release = grobid_parse_unstructured(args.raw_citation)
+    if not release:
+        print("Did not parse")
+        sys.exit(-1)
+    else:
+        print(entity_to_dict(release))
+    print("## Fuzzy matching...", file=sys.stderr)
+    closest = closest_fuzzy_release_match(release, es_client=es_client)
+    if not closest:
+        print("Did not match/verify")
+        sys.exit(-1)
+    print(f"{closest.status.name}\t{closest.reason.name}\trelease_{closest.release.ident}")
+
+
 if __name__ == '__main__':
     logging.basicConfig(
         level=logging.DEBUG,
@@ -277,6 +304,16 @@ if __name__ == '__main__':
     )
     sub_release_match.set_defaults(func=run_release_match)
 
+    sub_unstructured = subparsers.add_parser("unstructured",
+                                             help="parse and match unstructured citation string",
+                                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    sub_unstructured.add_argument(
+        "raw_citation",
+        help="unstructured/raw citation string",
+        type=str,
+    )
+    sub_unstructured.set_defaults(func=run_unstructured)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print(__doc__, file=sys.stderr)
diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py
new file mode 100755
index 0000000..c5aa0d2
--- /dev/null
+++ b/fuzzycat/grobid2json.py
@@ -0,0 +1,213 @@
+"""
+This file originally copied from the webgroup/sandcrawler repository.
+
+Parse GROBID TEI-XML and extract metadata into simple dict format. In
+particular, for fuzzycat, used with GROBID to parse raw ("unstructured")
+citation strings.
+""" + +import argparse +import io +import json +import xml.etree.ElementTree as ET +from typing import Any, AnyStr, Dict, List, Optional + +xml_ns = "http://www.w3.org/XML/1998/namespace" +ns = "http://www.tei-c.org/ns/1.0" + + +def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: +    if not elem: +        return [] +    names = [] +    for author in elem.findall(".//{%s}author" % ns): +        pn = author.find("./{%s}persName" % ns) +        if not pn: +            continue +        given_name = pn.findtext("./{%s}forename" % ns) or None +        surname = pn.findtext("./{%s}surname" % ns) or None +        full_name = " ".join(pn.itertext()).strip() +        full_name = " ".join(full_name.split()) +        obj: Dict[str, Any] = dict(name=full_name) +        if given_name: +            obj["given_name"] = given_name +        if surname: +            obj["surname"] = surname +        ae = author.find("./{%s}affiliation" % ns) +        if ae: +            affiliation: Dict[str, Any] = dict() +            for on in ae.findall("./{%s}orgName" % ns): +                on_type = on.get("type") +                if on_type: +                    affiliation[on_type] = on.text +            addr_e = ae.find("./{%s}address" % ns) +            if addr_e: +                address = dict() +                for t in addr_e.getchildren(): +                    address[t.tag.split("}")[-1]] = t.text +                if address: +                    affiliation["address"] = address +                # affiliation['address'] = { +                #    'post_code': addr.findtext('./{%s}postCode' % ns) or None, +                #    'settlement': addr.findtext('./{%s}settlement' % ns) or None, +                #    'country': addr.findtext('./{%s}country' % ns) or None, +                # } +            obj["affiliation"] = affiliation +        names.append(obj) +    return names + + +def journal_info(elem: ET.Element) -> Dict[str, Any]: +    journal = dict() +    journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") +    journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") +    if journal["publisher"] == "": +        journal["publisher"] = None +    journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) +    journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) +    journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) +    journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) +    keys = list(journal.keys()) + +    # remove empty/null keys +    for k in keys: +        if not journal[k]: +            journal.pop(k) +    return journal + + +def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: +    ref: Dict[str, Any] = dict() +    ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") +    ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) +    # Title stuff is messy in references... 
+    ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") +    other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") +    if other_title: +        if ref["title"]: +            ref["journal"] = other_title +        else: +            ref["journal"] = None +            ref["title"] = other_title +    ref["authors"] = all_authors(elem, ns=ns) +    ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") +    if not ref["publisher"]: +        ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") +    if ref["publisher"] == "": +        ref["publisher"] = None +    date = elem.find('.//{%s}date[@type="published"]' % ns) +    ref["date"] = (date is not None) and date.attrib.get("when") +    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) +    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) +    ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns) +    ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns) +    if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"): +        ref["arxiv_id"] = ref["arxiv_id"][6:] +    ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns) +    ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns) +    el = elem.find('.//{%s}biblScope[@unit="page"]' % ns) +    if el is not None: +        if el.attrib.get("from") and el.attrib.get("to"): +            ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"]) +        else: +            ref["pages"] = el.text +    el = elem.find(".//{%s}ptr[@target]" % ns) +    if el is not None: +        ref["url"] = el.attrib["target"] +        # Hand correction +        if ref["url"].endswith(".Lastaccessed"): +            ref["url"] = ref["url"].replace(".Lastaccessed", "") +        if ref["url"].startswith("<"): +            ref["url"] = ref["url"][1:] +        if ">" in ref["url"]: +            ref["url"] = ref["url"].split(">")[0] +    else: +        ref["url"] = None +    return ref + + +def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: + +    if isinstance(content, str): +        tree = ET.parse(io.StringIO(content)) +    elif isinstance(content, bytes): +        tree = ET.parse(io.BytesIO(content)) + +    info: Dict[str, Any] = dict() + +    # print(content) +    # print(content.getvalue()) +    tei = tree.getroot() + +    header = tei.find(".//{%s}teiHeader" % ns) +    if header is None: +        raise ValueError("XML does not look like TEI format") +    application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] +    info["grobid_version"] = application_tag.attrib["version"].strip() +    info["grobid_timestamp"] = application_tag.attrib["when"].strip() +    info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") +    info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) +    info["journal"] = journal_info(header) +    date = header.find('.//{%s}date[@type="published"]' % ns) +    info["date"] = (date is not None) and date.attrib.get("when") +    info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) +    info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) +    if info["doi"]: +        info["doi"] = info["doi"].lower() + +    refs = [] +    for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): +        ref = biblio_info(bs) +        ref["index"] = i +        refs.append(ref) +    info["citations"] = refs + +    
+    text = tei.find(".//{%s}text" % (ns))
+    # print(text.attrib)
+    if text and text.attrib.get("{%s}lang" % xml_ns):
+        info["language_code"] = text.attrib["{%s}lang" % xml_ns]  # xml:lang
+
+    if encumbered:
+        el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
+        info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
+        el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
+        info["body"] = (el or None) and " ".join(el.itertext()).strip()
+        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
+        info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
+        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
+        info["annex"] = (el or None) and " ".join(el.itertext()).strip()
+
+    # remove empty/null keys
+    keys = list(info.keys())
+    for k in keys:
+        if not info[k]:
+            info.pop(k)
+    return info
+
+
+def main() -> None:  # pragma: no cover
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description="GROBID TEI XML to JSON",
+        usage="%(prog)s [options] <teifile>...",
+    )
+    parser.add_argument(
+        "--no-encumbered",
+        action="store_true",
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
+    )
+    parser.add_argument("teifiles", nargs="+")
+
+    args = parser.parse_args()
+
+    for filename in args.teifiles:
+        content = open(filename, "r").read()
+        print(
+            json.dumps(
+                teixml2json(content, encumbered=(not args.no_encumbered)),
+                sort_keys=True,
+            ))
+
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
new file mode 100644
index 0000000..4f09bce
--- /dev/null
+++ b/fuzzycat/grobid_unstructured.py
@@ -0,0 +1,126 @@
+"""
+Helper functions to parse an unstructured citation string using GROBID, then
+fuzzy match using the result.
+
+- try to parse string with GROBID REST API call
+- transform the GROBID XML response to a simple dict/struct
+
+TODO: more general versions which handle multiple reference strings in a batch?
+"""
+
+import io
+import sys
+import xml.etree.ElementTree as ET
+from typing import Any, Optional, Tuple
+
+import requests
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+
+from fuzzycat.config import settings
+from fuzzycat.grobid2json import biblio_info
+
+GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
+
+
+def grobid_api_process_citation(raw_citation: str,
+                                grobid_api_base: str = GROBID_API_BASE,
+                                timeout: float = 20.0) -> Optional[str]:
+    """
+    Process a single citation string using GROBID API, returning a TEI-XML response.
+
+    Raises python TimeoutError if there was a network or request timeout.
+
+    Raises a 'requests' error on other unexpected failures (including network
+    connection failures)
+    """
+    try:
+        grobid_response = requests.post(
+            grobid_api_base + "/api/processCitation",
+            data={
+                "citations": raw_citation,
+                "consolidateCitations": 0,
+            },
+            timeout=timeout,
+        )
+    except requests.Timeout:
+        raise TimeoutError("GROBID request (HTTP POST) timeout")
+
+    if grobid_response.status_code == 204:
+        return None
+    elif grobid_response.status_code != 200:
+        print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
+    grobid_response.raise_for_status()
+
+    return grobid_response.text or None
+
+
+def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
+    """
+    Parses GROBID XML for the case of a single reference/citation string (eg,
+    not a full/proper TEI-XML fulltext document), and returns a dict.
+    """
+    # first, remove any xmlns stuff, for consistent parsing
+    raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+    tree = ET.parse(io.StringIO(raw_xml))
+    root = tree.getroot()
+    ref = biblio_info(root, ns="")
+    if not any(ref.values()):
+        return None
+    return ref
+
+
+def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
+    """
+    Takes the dict returned by transform_grobid_ref_xml() and returns a partial
+    ReleaseEntity object (for use with fuzzycat)
+    """
+    contribs = []
+    for author in ref.get("authors") or []:
+        contribs.append(
+            ReleaseContrib(
+                raw_name=author.get("name"),
+                given_name=author.get("given_name"),
+                surname=author.get("surname"),
+            ))
+    release = ReleaseEntity(
+        title=ref.get("title"),
+        contribs=contribs,
+        volume=ref.get("volume"),
+        issue=ref.get("issue"),
+        pages=ref.get("pages"),
+        ext_ids=ReleaseExtIds(
+            doi=ref.get("doi"),
+            pmid=ref.get("pmid"),
+            pmcid=ref.get("pmcid"),
+            arxiv=ref.get("arxiv_id"),
+        ),
+    )
+    if ref.get("journal"):
+        release.extra = {"container_name": ref.get("journal")}
+    if ref.get("date"):
+        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
+            release.release_year = int(ref["date"][0:4])
+        # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
+    return release
+
+
+def grobid_parse_unstructured(raw_citation: str,
+                              grobid_api_base: str = GROBID_API_BASE,
+                              timeout: float = 20.0) -> Optional[ReleaseEntity]:
+    """
+    High-level wrapper to parse a raw citation string into a (partial) release
+    entity.
+
+    Returns None if it fails to parse.
+
+    Raises various exceptions on network or remote errors.
+    """ +    ref_xml = grobid_api_process_citation(raw_citation, +                                          grobid_api_base=grobid_api_base, +                                          timeout=timeout) +    if not ref_xml: +        return None +    biblio_dict = transform_grobid_ref_xml(ref_xml) +    if not biblio_dict: +        return None +    return grobid_ref_to_release(biblio_dict) diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py new file mode 100644 index 0000000..c78ac28 --- /dev/null +++ b/fuzzycat/simple.py @@ -0,0 +1,274 @@ +""" +This file contains simple high-level functions that call in to match, verify, +and unstructured parsing routines. + +    close_fuzzy_release_matches(release) -> List[FuzzyReleaseMatchResult] +    close_fuzzy_biblio_matches(biblio) -> List[FuzzyReleaseMatchResult] +    close_fuzzy_unstructured_matches(unstructured) -> List[FuzzyReleaseMatchResult] + +Each function takes additional arguments: + +    es_client +    fatcat_api_client +    match_limit + +Each also has a "closest" variant, which returns just the single highest-rated +match. +""" + +from dataclasses import dataclass +from typing import Any, List, Optional + +from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds + +from fuzzycat.common import Reason, Status +from fuzzycat.entities import entity_to_dict +from fuzzycat.grobid_unstructured import grobid_parse_unstructured +from fuzzycat.matching import match_release_fuzzy +from fuzzycat.verify import verify + + +@dataclass +class FuzzyReleaseMatchResult: +    status: Status +    reason: Reason +    release: ReleaseEntity + + +# this map used to establish priority order of verified matches +STATUS_SORT = { +    Status.TODO: 0, +    Status.EXACT: 10, +    Status.STRONG: 20, +    Status.WEAK: 30, +    Status.AMBIGUOUS: 40, +    Status.DIFFERENT: 60, +} + + +def close_fuzzy_release_matches(release: ReleaseEntity, +                                es_client: Any, +                                fatcat_api_client: Optional[Any] = None, +                                match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]: +    """ +    This high-level helper function runs a fuzzy match (using elasticsearch), +    verifies all the results, and returns the "closest" matching results (if +    any). + +    es_client is required, and used in the matcing process. + +    fatcat_api_client is optional and used both for entity-to-dict conversion +    efficiency and for fetching current entities from the fatcat API + +    match_limit sets the maximum result size from the inital fuzzy match call + +    Returns an empty list if there was no match of any kind, or a sorted list +    of simple result objects (FuzzyReleaseMatchResult dataclass) with fields: + +        status: fuzzycat.common.Status +        reason: fuzzycat.common.Reason +        release: ReleaseEntity + +    Status is one of the fuzzycat.common.Status, with "strongest match" in this +    sorted order: + +    - EXACT +    - STRONG +    - WEAK +    - AMBIGUOUS + +    DIFFERENT and TODO matches are never returned. + +    Eg, if there is any EXACT match that is always returned; an AMBIGIOUS +    result is only returned if all the candidate matches were ambiguous. 
+    """ + +    candidates = match_release_fuzzy(release, size=match_limit, es=es_client) +    if not candidates: +        return None + +    release_dict = entity_to_dict(release, api_client=fatcat_api_client) + +    # list of tuple of (Verify, ReleaseEntity) +    verified = [( +        verify(release_dict, entity_to_dict(c, api_client=fatcat_api_client)), +        c, +    ) for c in candidates] + +    # list of FuzzyReleaseMatchResult, with TODO and DIFFERENT removed +    verified = [ +        FuzzyReleaseMatchResult(v[0].status, v[0].reason, v[1]) for v in verified +        if v[0].status not in [Status.TODO, Status.DIFFERENT] +    ] + +    return sorted(verified, key=lambda v: STATUS_SORT[v.status]) + + +def closest_fuzzy_release_match(release: ReleaseEntity, +                                es_client: Any, +                                fatcat_api_client: Optional[Any] = None, +                                match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]: +    """ +    Single-result variant of close_fuzzy_release_matches() +    """ +    matches = close_fuzzy_release_matches( +        release, +        es_client=es_client, +        fatcat_api_client=fatcat_api_client, +        match_limit=match_limit, +    ) +    if matches: +        return matches[0] +    else: +        return None + + +def close_fuzzy_unstructured_matches(raw_citation: str, +                                     es_client: Any, +                                     fatcat_api_client: Optional[Any] = None, +                                     match_limit: int = 5) -> List[FuzzyReleaseMatchResult]: +    """ +    Variation of close_fuzzy_release_matches() which first parses an +    unstructured citation string, then finds close matches. + +    TODO: pass-through GROBID API configuration? 
+    """ +    release = grobid_parse_unstructured(raw_citation) +    if not release: +        return None +    return close_fuzzy_release_matches( +        release, +        es_client=es_client, +        fatcat_api_client=fatcat_api_client, +        match_limit=match_limit, +    ) + + +def closest_fuzzy_unstructured_match(raw_citation: str, +                                     es_client: Any, +                                     fatcat_api_client: Optional[Any] = None, +                                     match_limit: int = 5) -> List[FuzzyReleaseMatchResult]: +    """ +    Single-result variant of close_fuzzy_release_matches() +    """ +    matches = close_fuzzy_unstructured_matches( +        raw_citation, +        es_client=es_client, +        fatcat_api_client=fatcat_api_client, +        match_limit=match_limit, +    ) +    if matches: +        return matches[0] +    else: +        return None + + +def biblio_to_release(biblio: dict) -> ReleaseEntity: +    """ +    Helper for close_fuzzy_biblio_matches() et al +    """ +    contribs = [] +    if biblio.get('authors'): +        for a in biblio['authors']: +            contribs.append( +                ReleaseContrib( +                    raw_name=a.get('name'), +                    given_name=a.get('given_name'), +                    surname=a.get('surname'), +                )) +    elif biblio.get('author_names'): +        for a in biblio['author_names']: +            contribs.append(ReleaseContrib(raw_name=a)) +    elif biblio.get('first_author'): +        contribs.append(ReleaseContrib(raw_name=biblio['first_author'])) +    release = ReleaseEntity( +        title=biblio.get("title"), +        ext_ids=ReleaseExtIds( +            doi=biblio.get("doi"), +            pmid=biblio.get("pmid"), +            pmcid=biblio.get("pmcid"), +            arxiv=biblio.get("arxiv_id"), +        ), +        volume=biblio.get("volume"), +        issue=biblio.get("issue"), +        pages=biblio.get("pages") or biblio.get("first_page"), +        publisher=biblio.get("publisher"), +        release_stage=biblio.get("release_stage"), +        release_type=biblio.get("release_type"), +        extra=dict(), +    ) +    if biblio.get('journal'): +        release.extra['container_name'] = biblio['journal'] +    elif biblio.get('conference'): +        release.extra['container_name'] = biblio['conference'] +    if biblio.get('year'): +        year = biblio['year'] +        if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit(): +            release.release_year = int(year[0:4]) +        elif isinstance(year, int): +            release.release_year = year +    elif biblio.get('date'): +        date = biblio['date'] +        if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit(): +            release.release_year = int(date[0:4]) +    return release + + +def close_fuzzy_biblio_matches(biblio: dict, +                               es_client: Any, +                               fatcat_api_client: Optional[Any] = None, +                               match_limit: int = 5) -> List[FuzzyReleaseMatchResult]: +    """ +    Variation of close_fuzzy_release_matches() which takes bibliographic fields +    as arguments. 
+
+    Biblio fields which are handled include:
+
+        title
+        journal
+        or: conference
+        authors
+            name
+            given_name
+            surname
+        or: author_names (List[str])
+        or: first_author (str)
+        year
+        date
+        volume
+        issue
+        pages
+        or: first_page
+        publisher
+        doi
+        pmid
+        arxiv_id
+        release_type (eg, 'journal-article', 'book', 'dataset')
+        release_stage
+    """
+    release = biblio_to_release(biblio)
+    return close_fuzzy_release_matches(
+        release,
+        es_client=es_client,
+        fatcat_api_client=fatcat_api_client,
+        match_limit=match_limit,
+    )
+
+
+def closest_fuzzy_biblio_match(biblio: dict,
+                               es_client: Any,
+                               fatcat_api_client: Optional[Any] = None,
+                               match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
+    """
+    Single-result variant of close_fuzzy_biblio_matches()
+    """
+    matches = close_fuzzy_biblio_matches(
+        biblio,
+        es_client=es_client,
+        fatcat_api_client=fatcat_api_client,
+        match_limit=match_limit,
+    )
+    if matches:
+        return matches[0]
+    else:
+        return None
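For completeness, the lower-level GROBID steps that grobid_parse_unstructured() chains together can also be driven one at a time, e.g. when the intermediate TEI-XML or dict is of interest (this mirrors what run_unstructured() does against fuzzycat.simple). A sketch, with the GROBID base URL left at the module default and the citation string purely illustrative; note the first call performs a live HTTP request:

```python
# Sketch: run the GROBID pipeline stages individually instead of via the wrapper.
from fuzzycat.grobid_unstructured import (
    grobid_api_process_citation,
    grobid_ref_to_release,
    transform_grobid_ref_xml,
)

raw = "Kahle, B. Preserving the Internet. Scientific American, March 1997."
tei_xml = grobid_api_process_citation(raw)       # TEI-XML string, or None
if tei_xml:
    biblio = transform_grobid_ref_xml(tei_xml)   # simple dict, or None
    if biblio:
        release = grobid_ref_to_release(biblio)  # partial ReleaseEntity
        print(release.title, release.release_year)
```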
