author | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:30:27 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:30:27 -0700 |
commit | 55ec15bb1aa158f783ec359ad1a27c979edc233d (patch) | |
tree | 3fa8eced612ee0569e65c991166ff56ad0a9fcdc | |
parent | 84169ce643075f1e49b18744d5609c7f1c48e7f7 (diff) | |
download | sandcrawler-55ec15bb1aa158f783ec359ad1a27c979edc233d.tar.gz, sandcrawler-55ec15bb1aa158f783ec359ad1a27c979edc233d.zip |
update grobid2json with type annotations
-rwxr-xr-x | python/grobid2json.py | 204 |
1 file changed, 110 insertions, 94 deletions
```diff
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 39ab222..0eae6fe 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -27,56 +27,64 @@ import io
 import json
 import argparse
 import xml.etree.ElementTree as ET
+from typing import List, Any, Dict, AnyStr, Optional

 xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"

-def all_authors(elem):
+
+def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
+    if not elem:
+        return []
     names = []
-    for author in elem.findall('.//{%s}author' % ns):
-        pn = author.find('./{%s}persName' % ns)
+    for author in elem.findall(".//{%s}author" % ns):
+        pn = author.find("./{%s}persName" % ns)
         if not pn:
             continue
-        given_name = pn.findtext('./{%s}forename' % ns) or None
-        surname = pn.findtext('./{%s}surname' % ns) or None
-        full_name = ' '.join(pn.itertext())
-        obj = dict(name=full_name)
+        given_name = pn.findtext("./{%s}forename" % ns) or None
+        surname = pn.findtext("./{%s}surname" % ns) or None
+        full_name = " ".join(pn.itertext())
+        obj: Dict[str, Any] = dict(name=full_name)
         if given_name:
-            obj['given_name'] = given_name
+            obj["given_name"] = given_name
         if surname:
-            obj['surname'] = surname
-        ae = author.find('./{%s}affiliation' % ns)
+            obj["surname"] = surname
+        ae = author.find("./{%s}affiliation" % ns)
         if ae:
-            affiliation = dict()
-            for on in ae.findall('./{%s}orgName' % ns):
-                affiliation[on.get('type')] = on.text
-            addr_e = ae.find('./{%s}address' % ns)
+            affiliation: Dict[str, Any] = dict()
+            for on in ae.findall("./{%s}orgName" % ns):
+                on_type = on.get("type")
+                if on_type:
+                    affiliation[on_type] = on.text
+            addr_e = ae.find("./{%s}address" % ns)
             if addr_e:
                 address = dict()
                 for t in addr_e.getchildren():
-                    address[t.tag.split('}')[-1]] = t.text
+                    address[t.tag.split("}")[-1]] = t.text
                 if address:
-                    affiliation['address'] = address
-                #affiliation['address'] = {
+                    affiliation["address"] = address
+                # affiliation['address'] = {
                 #    'post_code': addr.findtext('./{%s}postCode' % ns) or None,
                 #    'settlement': addr.findtext('./{%s}settlement' % ns) or None,
                 #    'country': addr.findtext('./{%s}country' % ns) or None,
-                #}
-            obj['affiliation'] = affiliation
+                # }
+            obj["affiliation"] = affiliation
         names.append(obj)
     return names


-def journal_info(elem):
+def journal_info(elem: ET.Element) -> Dict[str, Any]:
     journal = dict()
-    journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
-    journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
-    if journal['publisher'] == '':
-        journal['publisher'] = None
-    journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
-    journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
-    journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
+    journal["publisher"] = elem.findtext(
+        ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
+    )
+    if journal["publisher"] == "":
+        journal["publisher"] = None
+    journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
+    journal["eissn"] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
+    journal["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+    journal["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
     keys = list(journal.keys())

     # remove empty/null keys
@@ -86,88 +94,89 @@ def journal_info(elem):
     return journal


-def biblio_info(elem):
-    ref = dict()
-    ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
+def biblio_info(elem: ET.Element) -> Dict[str, Any]:
+    ref: Dict[str, Any] = dict()
+    ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
     # Title stuff is messy in references...
-    ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
-    other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
+    ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+    other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
     if other_title:
-        if ref['title']:
-            ref['journal'] = other_title
+        if ref["title"]:
+            ref["journal"] = other_title
         else:
-            ref['journal'] = None
-            ref['title'] = other_title
-    ref['authors'] = all_authors(elem)
-    ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
-    if ref['publisher'] == '':
-        ref['publisher'] = None
+            ref["journal"] = None
+            ref["title"] = other_title
+    ref["authors"] = all_authors(elem)
+    ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+    if ref["publisher"] == "":
+        ref["publisher"] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
-    ref['date'] = (date != None) and date.attrib.get('when')
-    ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
-    ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
-    el = elem.find('.//{%s}ptr[@target]' % ns)
+    ref["date"] = (date is not None) and date.attrib.get("when")
+    ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+    ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    el = elem.find(".//{%s}ptr[@target]" % ns)
     if el is not None:
-        ref['url'] = el.attrib['target']
+        ref["url"] = el.attrib["target"]
         # Hand correction
-        if ref['url'].endswith(".Lastaccessed"):
-            ref['url'] = ref['url'].replace(".Lastaccessed", "")
+        if ref["url"].endswith(".Lastaccessed"):
+            ref["url"] = ref["url"].replace(".Lastaccessed", "")
     else:
-        ref['url'] = None
+        ref["url"] = None
     return ref


-def teixml2json(content, encumbered=True):
+def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:

-    if type(content) == str:
-        content = io.StringIO(content)
-    elif type(content) == bytes:
-        content = io.BytesIO(content)
+    if isinstance(content, str):
+        tree = ET.parse(io.StringIO(content))
+    elif isinstance(content, bytes):
+        tree = ET.parse(io.BytesIO(content))

-    info = dict()
+    info: Dict[str, Any] = dict()

-    #print(content)
-    #print(content.getvalue())
-    tree = ET.parse(content)
+    # print(content)
+    # print(content.getvalue())
     tei = tree.getroot()

-    header = tei.find('.//{%s}teiHeader' % ns)
+    header = tei.find(".//{%s}teiHeader" % ns)
     if header is None:
         raise ValueError("XML does not look like TEI format")
-    application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
-    info['grobid_version'] = application_tag.attrib['version'].strip()
-    info['grobid_timestamp'] = application_tag.attrib['when'].strip()
-    info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
-    info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
-    info['journal'] = journal_info(header)
+    application_tag = header.findall(".//{%s}appInfo/{%s}application" % (ns, ns))[0]
+    info["grobid_version"] = application_tag.attrib["version"].strip()
+    info["grobid_timestamp"] = application_tag.attrib["when"].strip()
+    info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
+    info["authors"] = all_authors(
+        header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
+    )
+    info["journal"] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
-    info['date'] = (date != None) and date.attrib.get('when')
-    info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
-    info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
-    if info['doi']:
-        info['doi'] = info['doi'].lower()
+    info["date"] = (date is not None) and date.attrib.get("when")
+    info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+    info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+    if info["doi"]:
+        info["doi"] = info["doi"].lower()

     refs = []
-    for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
+    for (i, bs) in enumerate(tei.findall(".//{%s}listBibl/{%s}biblStruct" % (ns, ns))):
         ref = biblio_info(bs)
-        ref['index'] = i
+        ref["index"] = i
         refs.append(ref)
-    info['citations'] = refs
+    info["citations"] = refs

-    text = tei.find('.//{%s}text' % (ns))
-    #print(text.attrib)
-    if text.attrib.get('{%s}lang' % xml_ns):
-        info['language_code'] = text.attrib['{%s}lang' % xml_ns] # xml:lang
+    text = tei.find(".//{%s}text" % (ns))
+    # print(text.attrib)
+    if text and text.attrib.get("{%s}lang" % xml_ns):
+        info["language_code"] = text.attrib["{%s}lang" % xml_ns]  # xml:lang

     if encumbered:
-        el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
-        info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
-        el = tei.find('.//{%s}text/{%s}body' % (ns, ns))
-        info['body'] = (el or None) and " ".join(el.itertext()).strip()
+        el = tei.find(".//{%s}profileDesc/{%s}abstract" % (ns, ns))
+        info["abstract"] = (el or None) and " ".join(el.itertext()).strip()
+        el = tei.find(".//{%s}text/{%s}body" % (ns, ns))
+        info["body"] = (el or None) and " ".join(el.itertext()).strip()
         el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
-        info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip()
+        info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip()
         el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
-        info['annex'] = (el or None) and " ".join(el.itertext()).strip()
+        info["annex"] = (el or None) and " ".join(el.itertext()).strip()

     # remove empty/null keys
     keys = list(info.keys())
@@ -176,24 +185,31 @@ def teixml2json(content, encumbered=True):
             info.pop(k)
     return info

-def main(): # pragma no cover
+
+def main() -> None:  # pragma no cover
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         description="GROBID TEI XML to JSON",
-        usage="%(prog)s [options] <teifile>...")
-    parser.add_argument("--no-encumbered",
+        usage="%(prog)s [options] <teifile>...",
+    )
+    parser.add_argument(
+        "--no-encumbered",
         action="store_true",
-        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
-    parser.add_argument("teifiles", nargs='+')
+        help="don't include ambiguously copyright encumbered fields (eg, abstract, body)",
+    )
+    parser.add_argument("teifiles", nargs="+")
     args = parser.parse_args()

     for filename in args.teifiles:
-        content = open(filename, 'r')
-        print(json.dumps(
-            teixml2json(content,
-                encumbered=(not args.no_encumbered)),
-            sort_keys=True))
+        content = open(filename, "r").read()
+        print(
+            json.dumps(
+                teixml2json(content, encumbered=(not args.no_encumbered)),
+                sort_keys=True,
+            )
+        )
+

-if __name__=='__main__': # pragma no cover
+if __name__ == "__main__":  # pragma no cover
     main()
```