diff options
author | Martin Czygan <martin@archive.org> | 2021-11-04 17:05:09 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-11-04 17:05:09 +0000 |
commit | 282f315c6ba3643c8c614220ab2f7e1d55de3658 (patch) | |
tree | ccb9e818e18492708d90411cbe2ff7ba8ce0f5ca | |
parent | 615439df4955ca19bf3fdfa10b41b7d8950b3e63 (diff) | |
parent | 2f41335d268b0e2705a1ebff0ff104e965630837 (diff) | |
download | fuzzycat-282f315c6ba3643c8c614220ab2f7e1d55de3658.tar.gz fuzzycat-282f315c6ba3643c8c614220ab2f7e1d55de3658.zip |
Merge branch 'bnewbold-grobid-tei-xml' into 'master'
use grobid_tei_xml for grobid unstructured lookups
See merge request webgroup/fuzzycat!9
-rwxr-xr-x | fuzzycat/grobid2json.py | 213 | ||||
-rw-r--r-- | fuzzycat/grobid_unstructured.py | 61 | ||||
-rw-r--r-- | setup.py | 1 | ||||
-rw-r--r-- | tests/test_grobid_unstructured.py | 58 |
4 files changed, 56 insertions, 277 deletions
diff --git a/fuzzycat/grobid2json.py b/fuzzycat/grobid2json.py deleted file mode 100755 index c5aa0d2..0000000 --- a/fuzzycat/grobid2json.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -This file originally copied from the webgroup/sandcrawler repository. - -Parse GROBID TEI-XML and extract metadata into simple dict format. In -particular, for fuzzycat, used with GROBID to parse raw ("unstructured") -citation strings. -""" - -import argparse -import io -import json -import xml.etree.ElementTree as ET -from typing import Any, AnyStr, Dict, List, Optional - -xml_ns = "http://www.w3.org/XML/1998/namespace" -ns = "http://www.tei-c.org/ns/1.0" - - -def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: - if not elem: - return [] - names = [] - for author in elem.findall(".//{%s}author" % ns): - pn = author.find("./{%s}persName" % ns) - if not pn: - continue - given_name = pn.findtext("./{%s}forename" % ns) or None - surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join(pn.itertext()).strip() - full_name = " ".join(full_name.split()) - obj: Dict[str, Any] = dict(name=full_name) - if given_name: - obj["given_name"] = given_name - if surname: - obj["surname"] = surname - ae = author.find("./{%s}affiliation" % ns) - if ae: - affiliation: Dict[str, Any] = dict() - for on in ae.findall("./{%s}orgName" % ns): - on_type = on.get("type") - if on_type: - affiliation[on_type] = on.text - addr_e = ae.find("./{%s}address" % ns) - if addr_e: - address = dict() - for t in addr_e.getchildren(): - address[t.tag.split("}")[-1]] = t.text - if address: - affiliation["address"] = address - # affiliation['address'] = { - # 'post_code': addr.findtext('./{%s}postCode' % ns) or None, - # 'settlement': addr.findtext('./{%s}settlement' % ns) or None, - # 'country': addr.findtext('./{%s}country' % ns) or None, - # } - obj["affiliation"] = affiliation - names.append(obj) - return names - - -def journal_info(elem: ET.Element) -> Dict[str, Any]: - journal = dict() - journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") - if journal["publisher"] == "": - journal["publisher"] = None - journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) - journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns) - journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) - journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) - keys = list(journal.keys()) - - # remove empty/null keys - for k in keys: - if not journal[k]: - journal.pop(k) - return journal - - -def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: - ref: Dict[str, Any] = dict() - ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) - # Title stuff is messy in references... - ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - if other_title: - if ref["title"]: - ref["journal"] = other_title - else: - ref["journal"] = None - ref["title"] = other_title - ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") - if not ref["publisher"]: - ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") - if ref["publisher"] == "": - ref["publisher"] = None - date = elem.find('.//{%s}date[@type="published"]' % ns) - ref["date"] = (date is not None) and date.attrib.get("when") - ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns) - ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) - ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns) - ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns) - if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"): - ref["arxiv_id"] = ref["arxiv_id"][6:] - ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns) - ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns) - el = elem.find('.//{%s}biblScope[@unit="page"]' % ns) - if el is not None: - if el.attrib.get("from") and el.attrib.get("to"): - ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"]) - else: - ref["pages"] = el.text - el = elem.find(".//{%s}ptr[@target]" % ns) - if el is not None: - ref["url"] = el.attrib["target"] - # Hand correction - if ref["url"].endswith(".Lastaccessed"): - ref["url"] = ref["url"].replace(".Lastaccessed", "") - if ref["url"].startswith("<"): - ref["url"] = ref["url"][1:] - if ">" in ref["url"]: - ref["url"] = ref["url"].split(">")[0] - else: - ref["url"] = None - return ref - - -def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: - - if isinstance(content, str): - tree = ET.parse(io.StringIO(content)) - elif isinstance(content, bytes): - tree = ET.parse(io.BytesIO(content)) - - info: Dict[str, Any] = dict() - - # print(content) - # print(content.getvalue()) - tei = tree.getroot() - - header = tei.find(".//{%s}teiHeader" % ns) - if header is None: - raise ValueError("XML does not look like TEI format") - application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] - info["grobid_version"] = application_tag.attrib["version"].strip() - info["grobid_timestamp"] = application_tag.attrib["when"].strip() - info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) - info["journal"] = journal_info(header) - date = header.find('.//{%s}date[@type="published"]' % ns) - info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) - info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) - if info["doi"]: - info["doi"] = info["doi"].lower() - - refs = [] - for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): - ref = biblio_info(bs) - ref["index"] = i - refs.append(ref) - info["citations"] = refs - - text = tei.find(".//{%s}text" % (ns)) - # print(text.attrib) - if text and text.attrib.get("{%s}lang" % xml_ns): - info["language_code"] = text.attrib["{%s}lang" % xml_ns] # xml:lang - - if encumbered: - el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract") - info["abstract"] = (el or None) and " ".join(el.itertext()).strip() - el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") - info["body"] = (el or None) and " ".join(el.itertext()).strip() - el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() - el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') - info["annex"] = (el or None) and " ".join(el.itertext()).strip() - - # remove empty/null keys - keys = list(info.keys()) - for k in keys: - if not info[k]: - info.pop(k) - return info - - -def main() -> None: # pragma no cover - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="GROBID TEI XML to JSON", - usage="%(prog)s [options] <teifile>...", - ) - parser.add_argument( - "--no-encumbered", - action="store_true", - help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", - ) - parser.add_argument("teifiles", nargs="+") - - args = parser.parse_args() - - for filename in args.teifiles: - content = open(filename, "r").read() - print( - json.dumps( - teixml2json(content, encumbered=(not args.no_encumbered)), - sort_keys=True, - )) - - -if __name__ == "__main__": # pragma no cover - main() diff --git a/fuzzycat/grobid_unstructured.py b/fuzzycat/grobid_unstructured.py index 5462ae1..7470cd7 100644 --- a/fuzzycat/grobid_unstructured.py +++ b/fuzzycat/grobid_unstructured.py @@ -15,9 +15,9 @@ from typing import Optional import requests from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds +from grobid_tei_xml import parse_citation_xml, GrobidBiblio from fuzzycat.config import settings -from fuzzycat.grobid2json import biblio_info from fuzzycat.utils import clean_doi GROBID_API_BASE = settings.get("GROBID_API_BASE", "https://grobid.qa.fatcat.wiki") @@ -55,52 +55,37 @@ def grobid_api_process_citation(raw_citation: str, return grobid_response.text or None -def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: +def grobid_ref_to_release(ref: GrobidBiblio) -> ReleaseEntity: """ - Parses GROBID XML for the case of a single reference/citation string (eg, - not a full/propper TEI-XML fulltext document), and returns a dict. - """ - # first, remove any xmlns stuff, for consistent parsign - raw_xml = raw_xml.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") - tree = ET.parse(io.StringIO(raw_xml)) - root = tree.getroot() - ref = biblio_info(root, ns="") - if not any(ref.values()): - return None - return ref - - -def grobid_ref_to_release(ref: dict) -> ReleaseEntity: - """ - Takes the dict returned by transform_grobid_ref_xml() and returns a partial + Takes GrobidBiblio (parsed from TEI-XML) and returns a partial ReleaseEntity object (for use with fuzzycat) """ contribs = [] - for author in ref.get("authors") or []: + for author in ref.authors or []: contribs.append( ReleaseContrib( - raw_name=author.get("name"), - given_name=author.get("given_name"), - surname=author.get("surname"), + raw_name=author.full_name, + given_name=author.given_name, + surname=author.surname, )) release = ReleaseEntity( - title=ref.get("title"), + title=ref.title, contribs=contribs, - volume=ref.get("volume"), - issue=ref.get("issue"), - pages=ref.get("pages"), + volume=ref.volume, + issue=ref.issue, + pages=ref.pages, ext_ids=ReleaseExtIds( - doi=clean_doi(ref.get("doi")), - pmid=ref.get("pmid"), - pmcid=ref.get("pmcid"), - arxiv=ref.get("arxiv_id"), + doi=clean_doi(ref.doi), + pmid=ref.pmid, + pmcid=ref.pmcid, + arxiv=ref.arxiv_id, ), ) - if ref.get("journal"): - release.extra = {"container_name": ref.get("journal")} - if ref.get("date"): - if len(ref["date"]) >= 4 and ref["date"][0:4].isdigit(): - release.release_year = int(ref["date"][0:4]) + if ref.journal: + release.extra = {"container_name": ref.journal} + if ref.date: + if len(ref.date) >= 4 and ref.date[0:4].isdigit(): + release.release_year = int(ref.date[0:4]) # TODO: try to parse 'date' into an ISO date format, and assign to release_date? return release @@ -121,7 +106,7 @@ def grobid_parse_unstructured(raw_citation: str, timeout=timeout) if not ref_xml: return None - biblio_dict = transform_grobid_ref_xml(ref_xml) - if not biblio_dict: + ref = parse_citation_xml(ref_xml) + if not ref: return None - return grobid_ref_to_release(biblio_dict) + return grobid_ref_to_release(ref) @@ -39,6 +39,7 @@ with open("README.md", "r") as fh: "toml", "unidecode>=0.10", "zstandard", + "grobid_tei_xml==0.1.*", ], extras_require={"dev": [ "ipython", diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index dd69936..cf71f91 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -1,39 +1,43 @@ import pytest -from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release, transform_grobid_ref_xml +from grobid_tei_xml import parse_citation_xml +from grobid_tei_xml.types import GrobidBiblio, GrobidAuthor + +from fuzzycat.grobid_unstructured import grobid_api_process_citation, grobid_parse_unstructured, grobid_ref_to_release def test_grobid_ref_to_release(): - d = { - 'title': - "some title", - 'doi': - '10.1234/5678', - 'journal': - 'some journal', - 'authors': [ - { - 'name': 'ahab sailor', - 'given_name': 'ahab', - 'surname': 'sailor' - }, - { - 'name': 'mary jane', - 'given_name': 'mary', - 'surname': 'jane' - }, + d = GrobidBiblio( + title="some title", + doi='10.1234/5678', + journal='some journal', + authors=[ + GrobidAuthor( + full_name='ahab sailor', + given_name='ahab', + surname='sailor', + ), + GrobidAuthor( + full_name='mary jane', + given_name='mary', + surname='jane' + ), ], - } + ) r = grobid_ref_to_release(d) - assert r.title == d['title'] - assert r.ext_ids.doi == d['doi'] - assert r.extra['container_name'] == d['journal'] - assert r.contribs[0].surname == d['authors'][0]['surname'] - assert r.contribs[1].raw_name == d['authors'][1]['name'] + assert r.title == d.title + assert r.ext_ids.doi == d.doi + assert r.extra['container_name'] == d.journal + assert r.contribs[0].surname == d.authors[0].surname + assert r.contribs[1].raw_name == d.authors[1].full_name def test_transform_grobid_ref_xml(): + """ + This used to be a test of the grobid2json file in this repository. Now it + is a backwards compatibility test for grobid_tei_xml + """ citation_xml = """ <biblStruct > <analytic> @@ -83,7 +87,9 @@ def test_transform_grobid_ref_xml(): </monogr> </biblStruct>""" - d = transform_grobid_ref_xml(citation_xml) + citation = parse_citation_xml(citation_xml) + assert citation + d = citation.to_legacy_dict() assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" assert d['authors'][2]['given_name'] == "L" assert d['authors'][2]['surname'] == "Taveras" |