diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 14:05:26 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 14:05:26 -0700 |
commit | d58721ddb034363db10eb013986f560a87df7a19 (patch) | |
tree | 2f25cf50a638aa6db08d9470ecc8243db869b00c /grobid_tei_xml | |
parent | c1059b97772e91468edbcacab6bd830ca1ffaa81 (diff) | |
download | grobid_tei_xml-d58721ddb034363db10eb013986f560a87df7a19.tar.gz grobid_tei_xml-d58721ddb034363db10eb013986f560a87df7a19.zip |
make fmt
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x | grobid_tei_xml/grobid2json.py | 44 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 1 |
2 files changed, 25 insertions, 20 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index ac0710c..5edee36 100755 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ NB: adapted to work as a library for PDF extraction. Will probably be re-written eventually to be correct, complete, and robust; this is just a @@ -25,17 +24,18 @@ Prints JSON to stdout, errors to stderr This file copied from the sandcrawler repository. """ +import argparse import io import json -import argparse import xml.etree.ElementTree as ET -from typing import List, Any, Dict, AnyStr, Optional +from typing import Any, AnyStr, Dict, List, Optional xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -45,7 +45,8 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any] continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() + if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -79,8 +80,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher" - ) + f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -99,7 +99,8 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % + ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -110,9 +111,11 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["journal"] = None ref["title"] = other_title ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext( + f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext( + f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -162,23 +165,25 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall( + f".//{{{ns}}}appInfo/{{{ns}}}application")[0] info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") info["authors"] = all_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct") - ) + header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % + ns) info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) if info["doi"]: info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate( + tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = biblio_info(bs) ref["index"] = i refs.append(ref) @@ -195,7 +200,8 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join( + el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') info["annex"] = (el or None) and " ".join(el.itertext()).strip() @@ -216,7 +222,8 @@ def main() -> None: # pragma no cover parser.add_argument( "--no-encumbered", action="store_true", - help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", + help= + "don't include ambiguously copyright encumbered fields (eg, abstract, body)", ) parser.add_argument("teifiles", nargs="+") @@ -228,8 +235,7 @@ def main() -> None: # pragma no cover json.dumps( teixml2json(content, encumbered=(not args.no_encumbered)), sort_keys=True, - ) - ) + )) if __name__ == "__main__": # pragma no cover diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index 83808e0..eeb3507 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -29,4 +29,3 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: if not any(ref.values()): return None return ref - |