diff options
| -rwxr-xr-x | fuzzycat/grobid2json.py | 213 | ||||
| -rw-r--r-- | fuzzycat/grobid_unstructured.py | 61 | ||||
| -rw-r--r-- | setup.py | 1 | ||||
| -rw-r--r-- | tests/test_grobid_unstructured.py | 58 | 
4 files changed, 56 insertions, 277 deletions
| diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py deleted file mode 100755 index c5aa0d2..0000000 --- a/fuzzycat/grobid2json.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -This file originally copied from the webgroup/sandcrawler repository. - -Parse GROBID TEI-XML and extract metadata into simple dict format. In -particular, for fuzzycat, used with GROBID to parse raw ("unstructured") -citation strings. -""" - -import argparse -import io -import json -import xml.etree.ElementTree as ET -from typing import Any, AnyStr, Dict, List, Optional - -xml_ns = "http://www.w3.org/XML/1998/namespace" -ns = "http://www.tei-c.org/ns/1.0" - - -def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: -    if not elem: -        return [] -    names = [] -    for author in elem.findall(".//{%s}author" % ns): -        pn = author.find("./{%s}persName" % ns) -        if not pn: -            continue -        given_name = pn.findtext("./{%s}forename" % ns) or None -        surname = pn.findtext("./{%s}surname" % ns) or None -        full_name = " ".join(pn.itertext()).strip() -        full_name = " ".join(full_name.split()) -        obj: Dict[str, Any] = dict(name=full_name) -        if given_name: -            obj["given_name"] = given_name -        if surname: -            obj["surname"] = surname -        ae = author.find("./{%s}affiliation" % ns) -        if ae: -            affiliation: Dict[str, Any] = dict() -            for on in ae.findall("./{%s}orgName" % ns): -                on_type = on.get("type") -                if on_type: -                    affiliation[on_type] = on.text -            addr_e = ae.find("./{%s}address" % ns) -            if addr_e: -                address = dict() -                for t in addr_e.getchildren(): -                    address[t.tag.split("}")[-1]] = t.text -                if address: -                    affiliation["address"] = address -                # affiliation['address'] = { -                #    'post_code': addr.findtext('./{%s}postCode' % ns) or None, -                #    'settlement': addr.findtext('./{%s}settlement' % ns) or None, -                #    'country': addr.findtext('./{%s}country' % ns) or None, -                # } -            obj["affiliation"] = affiliation -        names.append(obj) -    return names - - -def journal_info(elem: ET.Element) -> Dict[str, Any]: -    journal = dict() -    journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") -    journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") -    if journal["publisher"] == "": -        journal["publisher"] = None -    journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) -    journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) -    journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) -    journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) -    keys = list(journal.keys()) - -    # remove empty/null keys -    for k in keys: -        if not journal[k]: -            journal.pop(k) -    return journal - - -def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: -    ref: Dict[str, Any] = dict() -    ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") -    ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) -    # Title stuff is messy in references... -    ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") -    other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") -    if other_title: -        if ref["title"]: -            ref["journal"] = other_title -        else: -            ref["journal"] = None -            ref["title"] = other_title -    ref["authors"] = all_authors(elem, ns=ns) -    ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") -    if not ref["publisher"]: -        ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") -    if ref["publisher"] == "": -        ref["publisher"] = None -    date = elem.find('.//{%s}date[@type="published"]' % ns) -    ref["date"] = (date is not None) and date.attrib.get("when") -    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) -    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) -    ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns) -    ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns) -    if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"): -        ref["arxiv_id"] = ref["arxiv_id"][6:] -    ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns) -    ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns) -    el = elem.find('.//{%s}biblScope[@unit="page"]' % ns) -    if el is not None: -        if el.attrib.get("from") and el.attrib.get("to"): -            ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"]) -        else: -            ref["pages"] = el.text -    el = elem.find(".//{%s}ptr[@target]" % ns) -    if el is not None: -        ref["url"] = el.attrib["target"] -        # Hand correction -        if ref["url"].endswith(".Lastaccessed"): -            ref["url"] = ref["url"].replace(".Lastaccessed", "") -        if ref["url"].startswith("<"): -            ref["url"] = ref["url"][1:] -        if ">" in ref["url"]: -            ref["url"] = ref["url"].split(">")[0] -    else: -        ref["url"] = None -    return ref - - -def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: - -    if isinstance(content, str): -        tree = ET.parse(io.StringIO(content)) -    elif isinstance(content, bytes): -        tree = ET.parse(io.BytesIO(content)) - -    info: Dict[str, Any] = dict() - -    # print(content) -    # print(content.getvalue()) -    tei = tree.getroot() - -    header = tei.find(".//{%s}teiHeader" % ns) -    if header is None: -        raise ValueError("XML does not look like TEI format") -    application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] -    info["grobid_version"] = application_tag.attrib["version"].strip() -    info["grobid_timestamp"] = application_tag.attrib["when"].strip() -    info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") -    info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) -    info["journal"] = journal_info(header) -    date = header.find('.//{%s}date[@type="published"]' % ns) -    info["date"] = (date is not None) and date.attrib.get("when") -    info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) -    info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) -    if info["doi"]: -        info["doi"] = info["doi"].lower() - -    refs = [] -    for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): -        ref = biblio_info(bs) -        ref["index"] = i -        refs.append(ref) -    info["citations"] = refs - -    text = tei.find(".//{%s}text" % (ns)) -    # print(text.attrib) -    if text and text.attrib.get("{%s}lang" % xml_ns): -        info["language_code"] = text.attrib["{%s}lang" % xml_ns]  # xml:lang - -    if encumbered: -        el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract") -        info["abstract"] = (el or None) and " ".join(el.itertext()).strip() -        el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") -        info["body"] = (el or None) and " ".join(el.itertext()).strip() -        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') -        info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() -        el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') -        info["annex"] = (el or None) and " ".join(el.itertext()).strip() - -    # remove empty/null keys -    keys = list(info.keys()) -    for k in keys: -        if not info[k]: -            info.pop(k) -    return info - - -def main() -> None:  # pragma no cover -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter, -        description="GROBID TEI XML to JSON", -        usage="%(prog)s [options] <teifile>...", -    ) -    parser.add_argument( -        "--no-encumbered", -        action="store_true", -        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", -    ) -    parser.add_argument("teifiles", nargs="+") - -    args = parser.parse_args() - -    for filename in args.teifiles: -        content = open(filename, "r").read() -        print( -            json.dumps( -                teixml2json(content, encumbered=(not args.no_encumbered)), -                sort_keys=True, -            )) - - -if __name__ == "__main__":  # pragma no cover -    main() diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py index 5462ae1..7470cd7 100644 --- a/fuzzycat/grobid_unstructured.py +++ b/fuzzycat/grobid_unstructured.py @@ -15,9 +15,9 @@ from typing import Optional  import requests  from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds +from grobid_tei_xml import parse_citation_xml, GrobidBiblio  from fuzzycat.config import settings -from fuzzycat.grobid2json import biblio_info  from fuzzycat.utils import clean_doi  GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki") @@ -55,52 +55,37 @@ def grobid_api_process_citation(raw_citation: str,      return grobid_response.text or None -def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: +def grobid_ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:      """ -    Parses GROBID XML for the case of a single reference/citation string (eg, -    not a full/propper TEI-XML fulltext document), and returns a dict. -    """ -    # first, remove any xmlns stuff, for consistent parsign -    raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") -    tree = ET.parse(io.StringIO(raw_xml)) -    root = tree.getroot() -    ref = biblio_info(root, ns="") -    if not any(ref.values()): -        return None -    return ref - - -def grobid_ref_to_release(ref: dict) -> ReleaseEntity: -    """ -    Takes the dict returned by transform_grobid_ref_xml() and returns a partial +    Takes GrobidBiblio (parsed from TEI-XML) and returns a partial      ReleaseEntity object (for use with fuzzycat)      """      contribs = [] -    for author in ref.get("authors") or []: +    for author in ref.authors or []:          contribs.append(              ReleaseContrib( -                raw_name=author.get("name"), -                given_name=author.get("given_name"), -                surname=author.get("surname"), +                raw_name=author.full_name, +                given_name=author.given_name, +                surname=author.surname,              ))      release = ReleaseEntity( -        title=ref.get("title"), +        title=ref.title,          contribs=contribs, -        volume=ref.get("volume"), -        issue=ref.get("issue"), -        pages=ref.get("pages"), +        volume=ref.volume, +        issue=ref.issue, +        pages=ref.pages,          ext_ids=ReleaseExtIds( -            doi=clean_doi(ref.get("doi")), -            pmid=ref.get("pmid"), -            pmcid=ref.get("pmcid"), -            arxiv=ref.get("arxiv_id"), +            doi=clean_doi(ref.doi), +            pmid=ref.pmid, +            pmcid=ref.pmcid, +            arxiv=ref.arxiv_id,          ),      ) -    if ref.get("journal"): -        release.extra = {"container_name": ref.get("journal")} -    if ref.get("date"): -        if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit(): -            release.release_year = int(ref["date"][0:4]) +    if ref.journal: +        release.extra = {"container_name": ref.journal} +    if ref.date: +        if len(ref.date) >= 4 and ref.date[0:4].isdigit(): +            release.release_year = int(ref.date[0:4])          # TODO: try to parse 'date' into an ISO date format, and assign to release_date?      return release @@ -121,7 +106,7 @@ def grobid_parse_unstructured(raw_citation: str,                                            timeout=timeout)      if not ref_xml:          return None -    biblio_dict = transform_grobid_ref_xml(ref_xml) -    if not biblio_dict: +    ref = parse_citation_xml(ref_xml) +    if not ref:          return None -    return grobid_ref_to_release(biblio_dict) +    return grobid_ref_to_release(ref) @@ -39,6 +39,7 @@ with open("README.md", "r") as fh:              "toml",              "unidecode>=0.10",              "zstandard", +            "grobid_tei_xml==0.1.*",          ],          extras_require={"dev": [              "ipython", diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index dd69936..cf71f91 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -1,39 +1,43 @@  import pytest -from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml +from grobid_tei_xml import parse_citation_xml +from grobid_tei_xml.types import GrobidBiblio, GrobidAuthor + +from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release  def test_grobid_ref_to_release(): -    d = { -        'title': -        "some title", -        'doi': -        '10.1234/5678', -        'journal': -        'some journal', -        'authors': [ -            { -                'name': 'ahab sailor', -                'given_name': 'ahab', -                'surname': 'sailor' -            }, -            { -                'name': 'mary jane', -                'given_name': 'mary', -                'surname': 'jane' -            }, +    d = GrobidBiblio( +        title="some title", +        doi='10.1234/5678', +        journal='some journal', +        authors=[ +            GrobidAuthor( +                full_name='ahab sailor', +                given_name='ahab', +                surname='sailor', +            ), +            GrobidAuthor( +                full_name='mary jane', +                given_name='mary', +                surname='jane' +            ),          ], -    } +    )      r = grobid_ref_to_release(d) -    assert r.title == d['title'] -    assert r.ext_ids.doi == d['doi'] -    assert r.extra['container_name'] == d['journal'] -    assert r.contribs[0].surname == d['authors'][0]['surname'] -    assert r.contribs[1].raw_name == d['authors'][1]['name'] +    assert r.title == d.title +    assert r.ext_ids.doi == d.doi +    assert r.extra['container_name'] == d.journal +    assert r.contribs[0].surname == d.authors[0].surname +    assert r.contribs[1].raw_name == d.authors[1].full_name  def test_transform_grobid_ref_xml(): +    """ +    This used to be a test of the grobid2json file in this repository. Now it +    is a backwards compatibility test for grobid_tei_xml +    """      citation_xml = """  <biblStruct >      <analytic> @@ -83,7 +87,9 @@ def test_transform_grobid_ref_xml():      </monogr>  </biblStruct>""" -    d = transform_grobid_ref_xml(citation_xml) +    citation = parse_citation_xml(citation_xml) +    assert citation +    d = citation.to_legacy_dict()      assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"      assert d['authors'][2]['given_name'] == "L"      assert d['authors'][2]['surname'] == "Taveras" | 
