author     Martin Czygan <martin@archive.org>  2021-04-15 14:11:09 +0000
committer  Martin Czygan <martin@archive.org>  2021-04-15 14:11:09 +0000
commit     b27c43071ab021e9595457999359009cfd7a1abb (patch)
tree       e00199889528c00f777f5bbc908d0962760fb96f
parent     8a17311c9516e63aeb31111647fdf21083bcf928 (diff)
parent     d44a9e421edfec2cac16048b67e6809cae8cdd18 (diff)
Merge branch 'bnewbold-upstreaming' into 'master'
refactoring/upstreaming fuzzycat "live" matching helpers

See merge request webgroup/fuzzycat!2
-rw-r--r--  fuzzycat/__main__.py                39
-rwxr-xr-x  fuzzycat/grobid2json.py            213
-rw-r--r--  fuzzycat/grobid_unstructured.py    126
-rw-r--r--  fuzzycat/simple.py                 274
-rw-r--r--  tests/test_grobid_unstructured.py  130
-rw-r--r--  tests/test_simple.py                42
6 files changed, 823 insertions, 1 deletions
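For orientation, the new high-level helpers added here can be driven directly from Python; a minimal sketch (not part of this diff), assuming a reachable fatcat elasticsearch endpoint, with the URL taken from tests/test_simple.py below:

    import elasticsearch
    from fuzzycat.simple import closest_fuzzy_unstructured_match

    es_client = elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
    # parse the raw citation via GROBID, then fuzzy match + verify against the release index
    match = closest_fuzzy_unstructured_match(
        "Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following "
        "abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243.",
        es_client=es_client)
    if match:
        print(match.status.name, match.reason.name, match.release.ident)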
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 9121bd8..43691e8 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -8,6 +8,7 @@ COMMANDS
    verify_single
    verify_ref
    release_match
+    unstructured
Run, e.g. fuzzycat cluster --help for more options.
@@ -45,7 +46,7 @@ EXAMPLES
    Release match (non-bulk).
-    $ python -m fuzzycat release_match -q "hello world"
+    $ python -m fuzzycat release_match --value "hello world"
TODO: Elasticsearch might not respond to POST queries (which is what the
client library uses, see: https://git.io/JLssk).
@@ -63,6 +64,7 @@ import random
import sys
import tempfile
+import elasticsearch
import requests
from fatcat_openapi_client import ReleaseEntity
@@ -70,8 +72,10 @@ from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngra
                              release_key_title_normalized, release_key_title_nysiis,
                              release_key_title_sandcrawler)
from fuzzycat.entities import entity_to_dict
+from fuzzycat.grobid_unstructured import grobid_parse_unstructured
from fuzzycat.matching import anything_to_entity, match_release_fuzzy
from fuzzycat.refs import RefsGroupVerifier
+from fuzzycat.simple import closest_fuzzy_release_match
from fuzzycat.utils import random_idents_from_query, random_word
from fuzzycat.verify import GroupVerifier, verify
@@ -200,6 +204,29 @@ def run_release_match(args):
    print(json.dumps(vs))
+def run_unstructured(args):
+    """
+    Given a raw citation string, parse it and find the "closest" match.
+
+    Uses lower-level routines, instead of simple.closest_fuzzy_unstructured_match(raw_citation)
+    """
+    es_client = elasticsearch.Elasticsearch(args.es_url)
+
+    print("## Sending to GROBID...", file=sys.stderr)
+    release = grobid_parse_unstructured(args.raw_citation)
+    if not release:
+        print("Did not parse")
+        sys.exit(-1)
+    else:
+        print(entity_to_dict(release))
+    print("## Fuzzy matching...", file=sys.stderr)
+    closest = closest_fuzzy_release_match(release, es_client=es_client)
+    if not closest:
+        print("Did not match/verify")
+        sys.exit(-1)
+    print(f"{closest.status.name}\t{closest.reason.name}\trelease_{closest.release.ident}")
+
+
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.DEBUG,
@@ -277,6 +304,16 @@
    )
    sub_release_match.set_defaults(func=run_release_match)
+    sub_unstructured = subparsers.add_parser("unstructured",
+                                             help="parse and match unstructured citation string",
+                                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    sub_unstructured.add_argument(
+        "raw_citation",
+        help="unstructured/raw citation string",
+        type=str,
+    )
+    sub_unstructured.set_defaults(func=run_unstructured)
+
    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print(__doc__, file=sys.stderr)
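The new subcommand can then be exercised from the shell; a hypothetical invocation (run_unstructured reads args.es_url, so the elasticsearch endpoint comes from the parser-level option defined elsewhere in this file):

    $ python -m fuzzycat unstructured "Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243."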
diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py
new file mode 100755
index 0000000..c5aa0d2
--- /dev/null
+++ b/fuzzycat/grobid2json.py
@@ -0,0 +1,213 @@
+"""
+This file originally copied from the webgroup/sandcrawler repository.
+
+Parse GROBID TEI-XML and extract metadata into simple dict format. In
+particular, for fuzzycat, used with GROBID to parse raw ("unstructured")
+citation strings.
+"""
+
+import argparse
+import io
+import json
+import xml.etree.ElementTree as ET
+from typing import Any, AnyStr, Dict, List, Optional
+
+xml_ns = "http://www.w3.org/XML/1998/namespace"
+ns = "http://www.tei-c.org/ns/1.0"
+
+
+def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]:
+ if not elem:
+ return []
+ names = []
+ for author in elem.findall(".//{%s}author" % ns):
+ pn = author.find("./{%s}persName" % ns)
+ if not pn:
+ continue
+ given_name = pn.findtext("./{%s}forename" % ns) or None
+ surname = pn.findtext("./{%s}surname" % ns) or None
+ full_name = " ".join(pn.itertext()).strip()
+ full_name = " ".join(full_name.split())
+ obj: Dict[str, Any] = dict(name=full_name)
+ if given_name:
+ obj["given_name"] = given_name
+ if surname:
+ obj["surname"] = surname
+ ae = author.find("./{%s}affiliation" % ns)
+ if ae:
+ affiliation: Dict[str, Any] = dict()
+ for on in ae.findall("./{%s}orgName" % ns):
+ on_type = on.get("type")
+ if on_type:
+ affiliation[on_type] = on.text
+ addr_e = ae.find("./{%s}address" % ns)
+ if addr_e:
+ address = dict()
+ for t in addr_e.getchildren():
+ address[t.tag.split("}")[-1]] = t.text
+ if address:
+ affiliation["address"] = address
+ # affiliation['address'] = {
+ # 'post_code': addr.findtext('./{%s}postCode' % ns) or None,
+ # 'settlement': addr.findtext('./{%s}settlement' % ns) or None,
+ # 'country': addr.findtext('./{%s}country' % ns) or None,
+ # }
+ obj["affiliation"] = affiliation
+ names.append(obj)
+ return names
+
+
+def journal_info(elem: ET.Element) -> Dict[str, Any]:
+ journal = dict()
+ journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
+ journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
+ if journal["publisher"] == "":
+ journal["publisher"] = None
+ journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
+ journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
+ journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ keys = list(journal.keys())
+
+ # remove empty/null keys
+ for k in keys:
+ if not journal[k]:
+ journal.pop(k)
+ return journal
+
+
+def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]:
+ ref: Dict[str, Any] = dict()
+ ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
+ ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
+ # Title stuff is messy in references...
+ ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
+ other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
+ if other_title:
+ if ref["title"]:
+ ref["journal"] = other_title
+ else:
+ ref["journal"] = None
+ ref["title"] = other_title
+ ref["authors"] = all_authors(elem, ns=ns)
+ ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
+ if not ref["publisher"]:
+ ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
+ if ref["publisher"] == "":
+ ref["publisher"] = None
+ date = elem.find('.//{%s}date[@type="published"]' % ns)
+ ref["date"] = (date is not None) and date.attrib.get("when")
+ ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
+ ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
+ if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
+ ref["arxiv_id"] = ref["arxiv_id"][6:]
+ ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
+ ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
+ el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)
+ if el is not None:
+ if el.attrib.get("from") and el.attrib.get("to"):
+ ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+ else:
+ ref["pages"] = el.text
+ el = elem.find(".//{%s}ptr[@target]" % ns)
+ if el is not None:
+ ref["url"] = el.attrib["target"]
+ # Hand correction
+ if ref["url"].endswith(".Lastaccessed"):
+ ref["url"] = ref["url"].replace(".Lastaccessed", "")
+ if ref["url"].startswith("<"):
+ ref["url"] = ref["url"][1:]
+ if ">" in ref["url"]:
+ ref["url"] = ref["url"].split(">")[0]
+ else:
+ ref["url"] = None
+ return ref
+
+
+def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
+
+ if isinstance(content, str):
+ tree = ET.parse(io.StringIO(content))
+ elif isinstance(content, bytes):
+ tree = ET.parse(io.BytesIO(content))
+
+ info: Dict[str, Any] = dict()
+
+ # print(content)
+ # print(content.getvalue())
+ tei = tree.getroot()
+
+ header = tei.find(".//{%s}teiHeader" % ns)
+ if header is None:
+ raise ValueError("XML does not look like TEI format")
+ application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0]
+ info["grobid_version"] = application_tag.attrib["version"].strip()
+ info["grobid_timestamp"] = application_tag.attrib["when"].strip()
+ info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title")
+ info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct"))
+ info["journal"] = journal_info(header)
+ date = header.find('.//{%s}date[@type="published"]' % ns)
+ info["date"] = (date is not None) and date.attrib.get("when")
+ info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+ info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+ if info["doi"]:
+ info["doi"] = info["doi"].lower()
+
+ refs = []
+ for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
+ ref = biblio_info(bs)
+ ref["index"] = i
+ refs.append(ref)
+ info["citations"] = refs
+
+ text = tei.find(".//{%s}text" % (ns))
+ # print(text.attrib)
+ if text and text.attrib.get("{%s}lang" % xml_ns):
+ info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang
+
+ if encumbered:
+ el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
+ info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
+ info["body"] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
+ info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
+ info["annex"] = (el or None) and " ".join(el.itertext()).strip()
+
+ # remove empty/null keys
+ keys = list(info.keys())
+ for k in keys:
+ if not info[k]:
+ info.pop(k)
+ return info
+
+
+def main() -> None: # pragma no cover
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description="GROBID TEI XML to JSON",
+ usage="%(prog)s [options] <teifile>...",
+ )
+ parser.add_argument(
+ "--no-encumbered",
+ action="store_true",
+ help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
+ )
+ parser.add_argument("teifiles", nargs="+")
+
+ args = parser.parse_args()
+
+ for filename in args.teifiles:
+ content = open(filename, "r").read()
+ print(
+ json.dumps(
+ teixml2json(content, encumbered=(not args.no_encumbered)),
+ sort_keys=True,
+ ))
+
+
+if __name__ == "__main__": # pragma no cover
+ main()
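As the main() entrypoint suggests, the module also works as a standalone TEI-to-JSON converter; a sketch, with a hypothetical input filename:

    $ python -m fuzzycat.grobid2json --no-encumbered document.tei.xml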
diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py
new file mode 100644
index 0000000..4f09bce
--- /dev/null
+++ b/fuzzycat/grobid_unstructured.py
@@ -0,0 +1,126 @@
+"""
+Helper functions to parse an unstructured citation string using GROBID, then
+fuzzy match using the result.
+
+- try to parse string with GROBID REST API call
+- transform the GROBID XML response to a simple dict/struct
+
+TODO: more general versions which handle multiple reference strings in a batch?
+"""
+
+import io
+import sys
+import xml.etree.ElementTree as ET
+from typing import Any, Optional, Tuple
+
+import requests
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+
+from fuzzycat.config import settings
+from fuzzycat.grobid2json import biblio_info
+
+GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki")
+
+
+def grobid_api_process_citation(raw_citation: str,
+ grobid_api_base: str = GROBID_API_BASE,
+ timeout: float = 20.0) -> Optional[str]:
+ """
+ Process a single citation string using GROBID API, returning a TEI-XML response.
+
+ Raises python TimeoutError if there was a network or request timeout.
+
+ Raises a 'requests' error other unexpected failures (including network
+ connection failures)
+ """
+ try:
+ grobid_response = requests.post(
+ grobid_api_base + "/api/processCitation",
+ data={
+ "citations": raw_citation,
+ "consolidateCitations": 0,
+ },
+ timeout=timeout,
+ )
+ except requests.Timeout:
+ raise TimeoutError("GROBID request (HTTP POST) timeout")
+
+ if grobid_response.status_code == 204:
+ return None
+ elif grobid_response.status_code != 200:
+ print(f"GROBID request (HTTP POST) failed: {grobid_response}", file=sys.stderr)
+ grobid_response.raise_for_status()
+
+ return grobid_response.text or None
+
+
+def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
+ """
+ Parses GROBID XML for the case of a single reference/citation string (eg,
+ not a full/propper TEI-XML fulltext document), and returns a dict.
+ """
+ # first, remove any xmlns stuff, for consistent parsign
+ raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
+ tree = ET.parse(io.StringIO(raw_xml))
+ root = tree.getroot()
+ ref = biblio_info(root, ns="")
+ if not any(ref.values()):
+ return None
+ return ref
+
+
+def grobid_ref_to_release(ref: dict) -> ReleaseEntity:
+ """
+ Takes the dict returned by transform_grobid_ref_xml() and returns a partial
+ ReleaseEntity object (for use with fuzzycat)
+ """
+ contribs = []
+ for author in ref.get("authors") or []:
+ contribs.append(
+ ReleaseContrib(
+ raw_name=author.get("name"),
+ given_name=author.get("given_name"),
+ surname=author.get("surname"),
+ ))
+ release = ReleaseEntity(
+ title=ref.get("title"),
+ contribs=contribs,
+ volume=ref.get("volume"),
+ issue=ref.get("issue"),
+ pages=ref.get("pages"),
+ ext_ids=ReleaseExtIds(
+ doi=ref.get("doi"),
+ pmid=ref.get("pmid"),
+ pmcid=ref.get("pmcid"),
+ arxiv=ref.get("arxiv_id"),
+ ),
+ )
+ if ref.get("journal"):
+ release.extra = {"container_name": ref.get("journal")}
+ if ref.get("date"):
+ if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit():
+ release.release_year = int(ref["date"][0:4])
+ # TODO: try to parse 'date' into an ISO date format, and assign to release_date?
+ return release
+
+
+def grobid_parse_unstructured(raw_citation: str,
+ grobid_api_base: str = GROBID_API_BASE,
+ timeout: float = 20.0) -> Optional[ReleaseEntity]:
+ """
+ High-level wrapper to parse a raw citation string into a (partial) release
+ entity.
+
+ Returns None if it fails to parse.
+
+ Raises various exceptions on network or remote errors.
+ """
+ ref_xml = grobid_api_process_citation(raw_citation,
+ grobid_api_base=grobid_api_base,
+ timeout=timeout)
+ if not ref_xml:
+ return None
+ biblio_dict = transform_grobid_ref_xml(ref_xml)
+ if not biblio_dict:
+ return None
+ return grobid_ref_to_release(biblio_dict)
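The individual steps wrapped by grobid_parse_unstructured() can also be run separately, eg to inspect the intermediate TEI-XML; a sketch, requiring a reachable GROBID instance (citation string shortened):

    from fuzzycat.grobid_unstructured import (grobid_api_process_citation,
                                              transform_grobid_ref_xml,
                                              grobid_ref_to_release)

    # step 1: raw citation string -> TEI-XML (None if GROBID could not parse it)
    tei_xml = grobid_api_process_citation(
        "Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9.")
    if tei_xml:
        # step 2: TEI-XML -> flat biblio dict (None if all fields came back empty)
        biblio = transform_grobid_ref_xml(tei_xml)
        if biblio:
            # step 3: dict -> partial ReleaseEntity, ready for fuzzy matching
            release = grobid_ref_to_release(biblio)
            print(release.title, release.release_year)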
diff --git a/fuzzycat/simple.py b/fuzzycat/simple.py
new file mode 100644
index 0000000..c78ac28
--- /dev/null
+++ b/fuzzycat/simple.py
@@ -0,0 +1,274 @@
+"""
+This file contains simple high-level functions that call in to match, verify,
+and unstructured parsing routines.
+
+ close_fuzzy_release_matches(release) -> List[FuzzyReleaseMatchResult]
+ close_fuzzy_biblio_matches(biblio) -> List[FuzzyReleaseMatchResult]
+ close_fuzzy_unstructured_matches(unstructured) -> List[FuzzyReleaseMatchResult]
+
+Each function takes additional arguments:
+
+ es_client
+ fatcat_api_client
+ match_limit
+
+Each also has a "closest" variant, which returns just the single highest-rated
+match.
+"""
+
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
+
+from fuzzycat.common import Reason, Status
+from fuzzycat.entities import entity_to_dict
+from fuzzycat.grobid_unstructured import grobid_parse_unstructured
+from fuzzycat.matching import match_release_fuzzy
+from fuzzycat.verify import verify
+
+
+@dataclass
+class FuzzyReleaseMatchResult:
+ status: Status
+ reason: Reason
+ release: ReleaseEntity
+
+
+# this map used to establish priority order of verified matches
+STATUS_SORT = {
+ Status.TODO: 0,
+ Status.EXACT: 10,
+ Status.STRONG: 20,
+ Status.WEAK: 30,
+ Status.AMBIGUOUS: 40,
+ Status.DIFFERENT: 60,
+}
+
+
+def close_fuzzy_release_matches(release: ReleaseEntity,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
+ """
+ This high-level helper function runs a fuzzy match (using elasticsearch),
+ verifies all the results, and returns the "closest" matching results (if
+ any).
+
+ es_client is required, and used in the matcing process.
+
+ fatcat_api_client is optional and used both for entity-to-dict conversion
+ efficiency and for fetching current entities from the fatcat API
+
+ match_limit sets the maximum result size from the inital fuzzy match call
+
+ Returns an empty list if there was no match of any kind, or a sorted list
+ of simple result objects (FuzzyReleaseMatchResult dataclass) with fields:
+
+ status: fuzzycat.common.Status
+ reason: fuzzycat.common.Reason
+ release: ReleaseEntity
+
+ Status is one of the fuzzycat.common.Status, with "strongest match" in this
+ sorted order:
+
+ - EXACT
+ - STRONG
+ - WEAK
+ - AMBIGUOUS
+
+ DIFFERENT and TODO matches are never returned.
+
+ Eg, if there is any EXACT match that is always returned; an AMBIGIOUS
+ result is only returned if all the candidate matches were ambiguous.
+ """
+
+ candidates = match_release_fuzzy(release, size=match_limit, es=es_client)
+ if not candidates:
+ return None
+
+ release_dict = entity_to_dict(release, api_client=fatcat_api_client)
+
+ # list of tuple of (Verify, ReleaseEntity)
+ verified = [(
+ verify(release_dict, entity_to_dict(c, api_client=fatcat_api_client)),
+ c,
+ ) for c in candidates]
+
+ # list of FuzzyReleaseMatchResult, with TODO and DIFFERENT removed
+ verified = [
+ FuzzyReleaseMatchResult(v[0].status, v[0].reason, v[1]) for v in verified
+ if v[0].status not in [Status.TODO, Status.DIFFERENT]
+ ]
+
+ return sorted(verified, key=lambda v: STATUS_SORT[v.status])
+
+
+def closest_fuzzy_release_match(release: ReleaseEntity,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> Optional[FuzzyReleaseMatchResult]:
+ """
+ Single-result variant of close_fuzzy_release_matches()
+ """
+ matches = close_fuzzy_release_matches(
+ release,
+ es_client=es_client,
+ fatcat_api_client=fatcat_api_client,
+ match_limit=match_limit,
+ )
+ if matches:
+ return matches[0]
+ else:
+ return None
+
+
+def close_fuzzy_unstructured_matches(raw_citation: str,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
+ """
+ Variation of close_fuzzy_release_matches() which first parses an
+ unstructured citation string, then finds close matches.
+
+ TODO: pass-through GROBID API configuration?
+ """
+ release = grobid_parse_unstructured(raw_citation)
+ if not release:
+ return None
+ return close_fuzzy_release_matches(
+ release,
+ es_client=es_client,
+ fatcat_api_client=fatcat_api_client,
+ match_limit=match_limit,
+ )
+
+
+def closest_fuzzy_unstructured_match(raw_citation: str,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
+ """
+ Single-result variant of close_fuzzy_release_matches()
+ """
+ matches = close_fuzzy_unstructured_matches(
+ raw_citation,
+ es_client=es_client,
+ fatcat_api_client=fatcat_api_client,
+ match_limit=match_limit,
+ )
+ if matches:
+ return matches[0]
+ else:
+ return None
+
+
+def biblio_to_release(biblio: dict) -> ReleaseEntity:
+ """
+ Helper for close_fuzzy_biblio_matches() et al
+ """
+ contribs = []
+ if biblio.get('authors'):
+ for a in biblio['authors']:
+ contribs.append(
+ ReleaseContrib(
+ raw_name=a.get('name'),
+ given_name=a.get('given_name'),
+ surname=a.get('surname'),
+ ))
+ elif biblio.get('author_names'):
+ for a in biblio['author_names']:
+ contribs.append(ReleaseContrib(raw_name=a))
+ elif biblio.get('first_author'):
+ contribs.append(ReleaseContrib(raw_name=biblio['first_author']))
+ release = ReleaseEntity(
+ title=biblio.get("title"),
+ ext_ids=ReleaseExtIds(
+ doi=biblio.get("doi"),
+ pmid=biblio.get("pmid"),
+ pmcid=biblio.get("pmcid"),
+ arxiv=biblio.get("arxiv_id"),
+ ),
+ volume=biblio.get("volume"),
+ issue=biblio.get("issue"),
+ pages=biblio.get("pages") or biblio.get("first_page"),
+ publisher=biblio.get("publisher"),
+ release_stage=biblio.get("release_stage"),
+ release_type=biblio.get("release_type"),
+ extra=dict(),
+ )
+ if biblio.get('journal'):
+ release.extra['container_name'] = biblio['journal']
+ elif biblio.get('conference'):
+ release.extra['container_name'] = biblio['conference']
+ if biblio.get('year'):
+ year = biblio['year']
+ if isinstance(year, str) and len(year) >= 4 and year[0:4].isdigit():
+ release.release_year = int(year[0:4])
+ elif isinstance(year, int):
+ release.release_year = year
+ elif biblio.get('date'):
+ date = biblio['date']
+ if isinstance(date, str) and len(date) >= 4 and date[0:4].isdigit():
+ release.release_year = int(date[0:4])
+ return release
+
+
+def close_fuzzy_biblio_matches(biblio: dict,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
+ """
+ Variation of close_fuzzy_release_matches() which takes bibliographic fields
+ as arguments.
+
+ Biblio fields which are handled include:
+
+ title
+ journal
+ or: conference
+ authors
+ name
+ given_name
+ surname
+ or: author_names (List[str])
+ or: first_author (str)
+ year
+ date
+ volume
+ issue
+ pages
+ or: first_page
+ publisher
+ doi
+ pmid
+ arxiv_id
+ release_type (eg, 'journal-article', 'book', 'dataset')
+ release_stage
+ """
+ release = biblio_to_release(biblio)
+ return close_fuzzy_release_matches(
+ release,
+ es_client=es_client,
+ fatcat_api_client=fatcat_api_client,
+ match_limit=match_limit,
+ )
+
+
+def closest_fuzzy_biblio_match(biblio: dict,
+ es_client: Any,
+ fatcat_api_client: Optional[Any] = None,
+ match_limit: int = 5) -> List[FuzzyReleaseMatchResult]:
+ """
+ Single-result variant of close_fuzzy_biblio_matches()
+ """
+ matches = close_fuzzy_biblio_matches(
+ biblio,
+ es_client=es_client,
+ fatcat_api_client=fatcat_api_client,
+ match_limit=match_limit,
+ )
+ if matches:
+ return matches[0]
+ else:
+ return None
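A usage sketch for the biblio-dict variant, mirroring tests/test_simple.py below (the search endpoint is an assumption, taken from the FATCAT_SEARCH_URL default used there):

    import elasticsearch
    from fuzzycat.simple import close_fuzzy_biblio_matches

    es_client = elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
    matches = close_fuzzy_biblio_matches(
        dict(
            title="Mesh migration following abdominal hernia repair: a comprehensive review",
            first_author="Cunningham",
            year=2019,
            journal="Hernia",
        ),
        es_client=es_client)
    # results are sorted strongest-first (EXACT, STRONG, WEAK, AMBIGUOUS)
    for m in matches or []:
        print(m.status.name, m.reason.name, m.release.ident)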
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
new file mode 100644
index 0000000..dd69936
--- /dev/null
+++ b/tests/test_grobid_unstructured.py
@@ -0,0 +1,130 @@
+import pytest
+
+from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml
+
+
+def test_grobid_ref_to_release():
+
+    d = {
+        'title': "some title",
+        'doi': '10.1234/5678',
+        'journal': 'some journal',
+        'authors': [
+            {
+                'name': 'ahab sailor',
+                'given_name': 'ahab',
+                'surname': 'sailor'
+            },
+            {
+                'name': 'mary jane',
+                'given_name': 'mary',
+                'surname': 'jane'
+            },
+        ],
+    }
+    r = grobid_ref_to_release(d)
+    assert r.title == d['title']
+    assert r.ext_ids.doi == d['doi']
+    assert r.extra['container_name'] == d['journal']
+    assert r.contribs[0].surname == d['authors'][0]['surname']
+    assert r.contribs[1].raw_name == d['authors'][1]['name']
+
+
+def test_transform_grobid_ref_xml():
+    citation_xml = """
+<biblStruct >
+    <analytic>
+        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
+        <author>
+            <persName xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">H</forename>
+                <forename type="middle">B</forename>
+                <surname>Cunningham</surname>
+            </persName>
+        </author>
+        <author>
+            <persName xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">J</forename>
+                <forename type="middle">J</forename>
+                <surname>Weis</surname>
+            </persName>
+        </author>
+        <author>
+            <persName xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">L</forename>
+                <forename type="middle">R</forename>
+                <surname>Taveras</surname>
+            </persName>
+        </author>
+        <author>
+            <persName xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">S</forename>
+                <surname>Huerta</surname>
+            </persName>
+        </author>
+        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
+        <idno type="PMID">30701369</idno>
+    </analytic>
+    <monogr>
+        <title level="j">Hernia</title>
+        <imprint>
+            <biblScope unit="volume">23</biblScope>
+            <biblScope unit="issue">2</biblScope>
+            <biblScope unit="page" from="235" to="243" />
+            <date type="published" when="2019-01-30" />
+        </imprint>
+    </monogr>
+</biblStruct>"""
+
+    d = transform_grobid_ref_xml(citation_xml)
+    assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert d['authors'][2]['given_name'] == "L"
+    assert d['authors'][2]['surname'] == "Taveras"
+    assert d['authors'][2]['name'] == "L R Taveras"
+    assert d['doi'] == "10.1007/s10029-019-01898-9"
+    assert d['pmid'] == "30701369"
+    assert d['date'] == "2019-01-30"
+    assert d['pages'] == "235-243"
+    assert d['volume'] == "23"
+    assert d['issue'] == "2"
+    assert d['journal'] == "Hernia"
+
+
+def test_grobid_parse_unstructured():
+    """
+    NOTE: this test makes live network requests to GROBID
+    """
+
+    r = grobid_parse_unstructured("blah")
+    assert r is None
+
+    r = grobid_parse_unstructured(
+        """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369."""
+    )
+    assert r.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert r.contribs[0].surname == "Cunningham"
+    assert r.contribs[1].surname == "Weis"
+    assert r.contribs[2].surname == "Taveras"
+    assert r.contribs[3].surname == "Huerta"
+    assert r.extra['container_name'] == "Hernia"
+    assert r.release_year == 2019
+    assert r.volume == "23"
+    assert r.issue == "2"
+    assert r.pages == "235-243"
+    assert r.ext_ids.doi == "10.1007/s10029-019-01898-9"
+    assert r.ext_ids.pmid == "30701369"
+
+
+def test_grobid_parse_unstructured_timeout():
+    """
+    NOTE: this test makes live network requests to GROBID
+    """
+    with pytest.raises(TimeoutError):
+        grobid_parse_unstructured("blah", timeout=0.000001)
diff --git a/tests/test_simple.py b/tests/test_simple.py
new file mode 100644
index 0000000..0c5d216
--- /dev/null
+++ b/tests/test_simple.py
@@ -0,0 +1,42 @@
+"""
+These basically all hit external network services.
+"""
+
+import pytest
+import elasticsearch
+
+from fuzzycat.simple import *
+from fuzzycat.config import settings
+
+
+@pytest.fixture
+def es_client():
+ return elasticsearch.Elasticsearch(
+ [settings.get("FATCAT_SEARCH_URL", "https://search.fatcat.wiki:443")])
+
+
+def test_close_fuzzy_unstructured_matches(es_client):
+
+ matches = close_fuzzy_unstructured_matches(
+ """Cunningham HB, Weis JJ, Taveras LR, Huerta S. Mesh migration following abdominal hernia repair: a comprehensive review. Hernia. 2019 Apr;23(2):235-243. doi: 10.1007/s10029-019-01898-9. Epub 2019 Jan 30. PMID: 30701369.""",
+ es_client=es_client)
+
+ assert matches
+ assert matches[0].status.name == "EXACT"
+ assert matches[0].release.ext_ids.doi == "10.1007/s10029-019-01898-9"
+
+
+def test_close_fuzzy_biblio_matches(es_client):
+
+ matches = close_fuzzy_biblio_matches(dict(
+ title="Mesh migration following abdominal hernia repair: a comprehensive review",
+ first_author="Cunningham",
+ year=2019,
+ journal="Hernia",
+ ),
+ es_client=es_client)
+
+ assert matches
+ # TODO: should be "STRONG" or "WEAK" without all authors?
+ assert matches[0].status.name in ("STRONG", "WEAK", "AMBIGUOUS")
+ assert matches[0].release.ext_ids.doi == "10.1007/s10029-019-01898-9"
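Note that both new test modules hit live services (GROBID and the fatcat elasticsearch index), so they need network access; a typical invocation would be something like:

    $ pytest -v tests/test_simple.py tests/test_grobid_unstructured.py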